js/BatchTranslate.js

/**=====LICENSE STATEMENT START=====
    Translator++ 
    CAT (Computer-Assisted Translation) tools and framework to create quality
    translations and localizations efficiently.
        
    Copyright (C) 2018  Dreamsavior<dreamsavior@gmail.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
=====LICENSE STATEMENT END=====*/
/**
 * @typedef {Object} TranslationInfo
 * @property {string} path - Path to the object.
 * @property {number} row - Row number.
 * @property {string} original - Original text, should not be changed.
 * @property {string} toTranslate - Text to be translated.
 * @property {string} [translation=""] - Translated text.
 * @property {Object} [info] - Additional information.
 */
const pLimit = require('p-limit');
const hardLimitConcurrentRequest = 30;
const common = require("www/js/common");

/**
 * Build the colored "[index/total]" HTML label that prefixes log lines of a batch.
 * @param {BatchItems} batch - Batch whose `info.index` (zero-based) and `info.total` are displayed.
 * @returns {string} A `<span>` with the 1-based position, or an empty string when the batch has no `info`.
 */
const batchDisplay = function (batch) {
    const info = batch?.info;
    if (!info) {
        return "";
    }

    // The color is derived from the batch index by the common helper.
    const labelColor = common.generateRandomHexColor(info.index);
    const position = info.index + 1;
    return `<span style="color:${labelColor}">[${position}/${info.total}]</span>`;
};


/**
 * Array of translation items that also carries batch metadata in `info`.
 * `info` is a non-enumerable own property, so it does not show up when the
 * array's keys are enumerated.
 */
class BatchItems extends Array {
    constructor(...items) {
        super(...items);

        const infoDescriptor = {
            configurable: true,
            enumerable: false,
            writable: true,
            value: {},
        };
        Object.defineProperty(this, "info", infoDescriptor);
    }
}

/**
 * Handles batch translation of grid rows through a translator engine.
 *
 * The constructor only stores its arguments and initializes bookkeeping fields;
 * the actual work is done by the methods attached to the class and its prototype.
 * @class
 * @example
 * var BatchTranslate = require("www/js/BatchTranslate");
 * var batchTranslate = new BatchTranslate(trans.google, {
 *     ignoreTranslated: true,
 * });
 * await batchTranslate.translateAll();
 */
class BatchTranslate {
    /**
     * @param {Object} translator - Translator engine used for the translation.
     * @param {Object} [options] - Batch options; any falsy value is replaced by an empty object.
     */
    constructor(translator, options) {
        Object.assign(this, {
            translator,
            options: options || {},
            // Run-time information; `startAt` stays undefined until a run sets it.
            info: { startAt: undefined },
            // Names of the procedures executed, in order, for every batch.
            defaultFlow: [
                "translateWithReference",
                "beforeTranslateCommonRoutine",
                "translate",
                "afterTranslateCommonRoutine",
                "applyToGrid",
                "displayResult",
                "batchDelay",
            ],
            debugLevel: 0,
        });
    }
}

/**
 * Routines executed before the translation step, keyed by routine id.
 * Each routine is called with the BatchTranslate instance as `this` and receives the batch.
 */
BatchTranslate.beforeTranslateCommonRoutines = {
    /**
     * Collect context from the rows that precede the first item of the batch.
     *
     * Walks upward from the row just above the first batch item and gathers
     * original/translation pairs until the accumulated character count exceeds
     * the translator's `contextCharacterLimit` option (2048 when not set).
     * The result is stored, oldest row first, in `batch.info.contexts`, together
     * with a `batch.info.contextsToString(format)` helper.
     * Does nothing when the translator's `enableContext` option is falsy.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} The same batch.
     */
    contextCollector: async function (batch) {
        console.log("%cCollecting context...", "color:orange", arguments, this);
        if (!batch?.length) return batch;

        // if context is disabled, skip
        if (!this.translator.getOptions('enableContext')) return batch;
        console.log("%cContext enabled", "color:orange");

        const contextCharLimit = this.translator.getOptions('contextCharacterLimit') || 2048;
        // the context is relative to the very first entry of the batch
        const firstItem = batch[0];
        const contexts = [];
        let charCount = 0;
        // A missing table is treated as "no preceding rows" instead of throwing.
        const transData = trans.getData(firstItem.path) || [];
        for (let row = firstItem.row - 1; row >= 0; row--) {
            const cells = transData[row];
            if (!cells) continue;

            const original = cells[trans.keyColumn];
            // A row without key text has nothing to contribute; reading `.length`
            // of its null/undefined cell would throw.
            if (original === null || original === undefined) continue;

            // calculate the char count
            const translation = trans.getTranslationFromRow(cells, trans.keyColumn) || "";
            charCount += String(original).length;
            charCount += translation.length;
            if (charCount > contextCharLimit) break;

            if (translation) {
                contexts.push({
                    original: original,
                    translation: translation
                });
            } else {
                contexts.push({
                    original: original
                });
            }
        }
        // rows were collected bottom-up; restore the document order
        contexts.reverse();

        console.log("%cContexts:", "color:orange", contexts);
        // store the context into the batch.info.contexts
        batch.info.contexts = contexts;
        /**
         * Serialize the collected contexts.
         * The JSON branch reads `this.contexts`, so call it as a method of `batch.info`.
         * @param {string} [format="json"] - "original" for a newline-joined list of original texts; anything else for a fenced JSON block.
         * @returns {string}
         */
        batch.info.contextsToString = function(format="json") {
            if (format == "original") {
                // return the list of original text
                return contexts.map((item) => item.original).join("\n");
            } else {
                return "```json\n"+JSON.stringify(this.contexts, null, 2)+"\n```";
            }
        };

        return batch;
    }
};

// Routines executed after the translation step, keyed by routine id.
// Starts empty; entries are registered with addAfterTranslateCommonRoutine().
BatchTranslate.afterTranslateCommonRoutines = {};

/**
 * Register (or replace) a routine that runs before the translation step.
 * @param {string} id - Key under which the routine is stored.
 * @param {Function} routine - The routine to store.
 */
BatchTranslate.addBeforeTranslateCommonRoutine = function (id, routine) {
    const registry = BatchTranslate.beforeTranslateCommonRoutines;
    registry[id] = routine;
}

/**
 * Register (or replace) a routine that runs after the translation step.
 * @param {string} id - Key under which the routine is stored.
 * @param {Function} routine - The routine to store.
 */
BatchTranslate.addAfterTranslateCommonRoutine = function (id, routine) {
    const registry = BatchTranslate.afterTranslateCommonRoutines;
    registry[id] = routine;
}

/**
 * Unregister a routine that runs before the translation step.
 * @param {string} id - Key of the routine to remove.
 */
BatchTranslate.removeBeforeTranslateCommonRoutine = function (id) {
    const registry = BatchTranslate.beforeTranslateCommonRoutines;
    delete registry[id];
}

/**
 * Unregister a routine that runs after the translation step.
 * @param {string} id - Key of the routine to remove.
 */
BatchTranslate.removeAfterTranslateCommonRoutine = function (id) {
    const registry = BatchTranslate.afterTranslateCommonRoutines;
    delete registry[id];
}


/**
 * Get the actor glossary data.
 *
 * The glossary is built from the rows of the reference object
 * (`translatorOptions.actorReferencePath`, "Actor Reference" when not set) and
 * cached on `options.actorGlossaryData`, so later calls with the same `options`
 * object reuse it, whether or not an actor name is given.
 * @param {string} [actorName] - Actor to look up. When omitted, the whole glossary is returned.
 * @param {Object} [options] - The options for getting the actor glossary data.
 * @param {Object} [options.translatorOptions] - Must have a truthy `useActorReference`; also supplies
 *   `actorReferencePath`, `actorReferenceTranslationColumn` and `actorReferenceInfoColumn`.
 * @param {Object} [options.actorGlossaryData] - Cached glossary; written by this function.
 * @returns {Object|undefined} `{name, translation, info}` for a single actor, `{actors: {...}}` for the
 *   whole glossary, or undefined when the reference is disabled, missing, empty, or the actor is unknown.
 */
BatchTranslate.getActorGlossaryData = function(actorName, options={}) {
    console.log("%cgetActorGlossaryData", "color:cyan", actorName, options);
    if (!options?.translatorOptions?.useActorReference) return;

    if (!options.actorGlossaryData) {
        const translatorOptions = options.translatorOptions;
        const actorReferencePath = translatorOptions.actorReferencePath || "Actor Reference";
        // load the actor glossary data
        const reference = trans.getObjectById(actorReferencePath);
        if (!reference?.data) return;

        const actors = {};
        for (let i = 0; i < reference.data.length; i++) {
            const thisRow = reference.data[i];
            const rowActorName = thisRow?.[trans.keyColumn];
            // Rows without a name would otherwise register a bogus "undefined"/"" actor.
            if (!rowActorName) continue;
            actors[rowActorName] = {
                name: rowActorName,
                translation: thisRow[translatorOptions.actorReferenceTranslationColumn],
                info: thisRow[translatorOptions.actorReferenceInfoColumn]
            };
        }

        if (Object.keys(actors).length == 0) return;

        // Cache before answering, so single-actor lookups do not rebuild the glossary each time.
        options.actorGlossaryData = { actors: actors };
    }

    const glossary = options.actorGlossaryData;
    if (actorName) return glossary.actors?.[actorName];
    return glossary;
}

/**
 * List the actors that are relevant to the given messages.
 *
 * An actor is included when a `batchInfo` item names it in `actorOriginal`, or
 * when its name occurs in the messages. Each actor is rendered as one line in
 * the form `name (translation) - info`; the translation and info parts are
 * omitted when empty.
 * @param {string[]|string} [messages=[]] - Texts to scan for actor names.
 * @param {Object} [options] - Same options object accepted by getActorGlossaryData().
 * @param {Iterable<Object>} [options.batchInfo] - Batch items; their `actorOriginal` is registered when present.
 * @returns {string} Newline-joined actor lines; an empty string when the actor
 *   reference is disabled or nothing matches.
 */
BatchTranslate.getMentionedActor = function(messages=[], options={}) {
    console.log("%cgetMentionedActor", "color:green", messages, options);
    if (!options?.translatorOptions?.useActorReference) {
        console.log("actor reference is not used");
        return "";
    }

    const mentionedActors = {};
    // register from batchInfo (may be absent when called outside a batch run)
    console.log("Register from batchInfo");
    for (const item of options.batchInfo ?? []) {
        if (item?.actorOriginal) {
            mentionedActors[item.actorOriginal] = BatchTranslate.getActorGlossaryData(item.actorOriginal, options);
        }
    }

    // register from messages
    const actorGlossaryData = BatchTranslate.getActorGlossaryData(undefined, options);
    console.log("actorGlossaryData", actorGlossaryData);
    // The glossary is undefined when the reference table is missing or empty.
    const knownActors = actorGlossaryData?.actors ?? {};
    const joinedMessages = Array.isArray(messages) ? messages.join("\n") : String(messages ?? "");
    for (const actorName of Object.keys(knownActors)) {
        // An empty name would match every message, so it is never treated as a mention.
        if (actorName && joinedMessages.includes(actorName)) {
            mentionedActors[actorName] = knownActors[actorName];
        }
    }

    console.log("mentionedActors", mentionedActors);
    // compile into record of line with format: actorName (translation) - Info
    const resultLines = [];
    for (const actorName of Object.keys(mentionedActors)) {
        const thisActor = mentionedActors[actorName];
        // actors named by the batch but unknown to the glossary have no data
        if (!thisActor) continue;
        console.log("thisActor", thisActor);
        const translationBlock = thisActor.translation ? ` (${thisActor.translation})` : "";
        const infoBlock = thisActor.info ? ` - ${thisActor.info}` : "";
        resultLines.push(`${actorName}${translationBlock}${infoBlock}`);
    }

    return resultLines.join("\n");
}


/**
 * Collection of the default procedures for batch translation.
 * Procedures are the steps of the translation flow. A procedure is an async
 * function that is called with the BatchTranslate instance as `this`, accepts
 * one parameter (the batch of translation data) and returns that batch.
 */
BatchTranslate.procedures = {
    /**
     * Translate the given batch translation data with reference.
     * The result replaces each item's `toTranslate`; items without a reference
     * translation keep their current `toTranslate`.
     * Skipped when the translator has a truthy `skipReferencePair`.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} A promise that resolves to the batch.
     */
    translateWithReference: async function(batch) {
        if (!batch?.length) return batch;
        if (this.translator?.skipReferencePair) return batch;

        await ui.log(`${batchDisplay(batch)} Translating with reference...`);
        const texts = batch.map((item) => item.toTranslate);
        const translations = await trans.translateByReference(texts);
        console.log("Translations:", translations);
        // assign back the translation to the batch
        for (let i = 0; i < batch.length; i++) {
            batch[i].toTranslate = translations?.[i] || batch[i].toTranslate;
        }

        return batch;
    },

    /**
     * Before translate common routine.
     * Runs every function registered in BatchTranslate.beforeTranslateCommonRoutines, one after another.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} A promise that resolves to the batch.
     */
    beforeTranslateCommonRoutine: async function(batch) {
        for (const id in BatchTranslate.beforeTranslateCommonRoutines) {
            const routine = BatchTranslate.beforeTranslateCommonRoutines[id];
            if (typeof routine != "function") continue;
            await routine.call(this, batch);
        }
        return batch;
    },

    /**
     * After translate common routine.
     * Runs every function registered in BatchTranslate.afterTranslateCommonRoutines, one after another.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} A promise that resolves to the batch.
     */
    afterTranslateCommonRoutine: async function(batch) {
        for (const id in BatchTranslate.afterTranslateCommonRoutines) {
            const routine = BatchTranslate.afterTranslateCommonRoutines[id];
            if (typeof routine != "function") continue;
            await routine.call(this, batch);
        }
        return batch;
    },

    /**
     * Translate the given batch of translation data.
     * Items that already carry a translation are skipped. When no item needs
     * translation, the translator's optional `onEmptyBatch(batch)` hook is called instead.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} A promise that resolves to the translated batch.
     */
    translate: async function(batch) {
        if (!batch?.length) return batch;

        // Filter the items that need translation
        const itemsToTranslate = batch.filter(item => !item.translation);
        const texts = itemsToTranslate.map(item => item.toTranslate);

        // Perform translation only if there are texts to translate
        if (texts.length > 0) {
            console.log("%cTranslating texts:", "color:green;", texts);
            const percentage = ((batch.info.index + 1) / batch.info.total) * 100;
            await ui.log.progress(percentage, `Translating batch ${batch.info.index+1}/${batch.info.total}...`);
            const translations = await this.translator.translate(texts, {...this.options, ...{batchInfo: batch}});
            console.log("Translations:", translations);

            // Assign translations back to the appropriate items in the batch
            itemsToTranslate.forEach((item, index) => {
                item.translation = translations?.translation?.[index];
            });
        } else {
            console.log("No texts to translate in this batch, probably prefilled with cache");
            if (typeof this.translator.onEmptyBatch == "function") {
                await this.translator.onEmptyBatch(batch);
            }
        }

        console.log("Translated batch:", batch);
        return batch;
    },

    /**
     * Apply the translation to the grid.
     * Writes each item's translation into `this.options.targetColumn` of its row
     * and marks the item with `info.isApplied = true`. Items whose translation is
     * null/undefined are left alone, so a failed translation never wipes a cell.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} A promise that resolves to the batch.
     */
    applyToGrid: async function(batch) {
        if (!batch?.length) return batch;
        await ui.log(`${batchDisplay(batch)} Applying translation into the grid...`);
        const targetColumn = this.options.targetColumn;
        for (const item of batch) {
            // The translator produced nothing for this item; keep the cell as it is.
            if (item.translation === undefined || item.translation === null) continue;

            const table = trans.getData(item.path);
            const row = table?.[item.row];
            if (!row) continue;

            // Loose comparison kept on purpose: an unset `overwrite` means "overwrite".
            if (this.options.overwrite == false && row[targetColumn]) continue;

            row[targetColumn] = item.translation;
            // `info` is optional on a TranslationInfo
            item.info ||= {};
            item.info.isApplied = true;
        }
        return batch;
    },

    /**
     * Print the processed batch to the console.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} A promise that resolves to the batch.
     */
    displayResult: async function(batch) {
        console.log("Translated batch:", batch);
        return batch;
    },

    /**
     * Delay the execution for the next batch translation process, if any.
     * This procedure is useful for rate limiting the translation process.
     * Some endpoints have rate limits, so this procedure keeps you from being banned.
     * The delay is the translator's `batchDelay` option in milliseconds (1000 when
     * unset or 0) and is skipped for the last batch.
     * @param {BatchItems} batch - The batch of translation data.
     * @returns {Promise<BatchItems>} A promise that resolves to the batch.
     */
    batchDelay: async function(batch) {
        const info = batch?.info;
        // delay the batch if it is not the last batch
        if (info && info.index < info.total - 1) {
            const batchDelay = this.translator.getOptions('batchDelay') || 1000;
            await ui.log(`${batchDisplay(batch)} Waiting for ${batchDelay} ms...`);
            await common.wait(batchDelay);
        }
        return batch;
    }

};

/**
 * Register a batch translation procedure, replacing any procedure already
 * stored under the same name.
 * @param {string} procedureName - Key of the procedure in BatchTranslate.procedures.
 * @param {Function} procedure - The procedure function.
 */
BatchTranslate.defineProcedure = function (procedureName, procedure) {
    const registry = BatchTranslate.procedures;
    registry[procedureName] = procedure;
}


/**
 * Execute one named procedure against a batch.
 * The procedure is invoked with this instance as `this`. When it returns a
 * falsy value, the batch that was passed in is returned instead.
 * @param {string} procedureName - Key of the procedure in BatchTranslate.procedures.
 * @param {TranslationInfo[]} batch - The batch of translation data.
 * @returns {Promise<TranslationInfo[]>} A promise that resolves to the procedure's result, or to `batch`.
 * @throws {Error} Rejects when no function is registered under `procedureName`.
 */
BatchTranslate.prototype.runProcedure = async function (procedureName, batch) {
    const procedure = BatchTranslate.procedures[procedureName];
    if (typeof procedure != "function") {
        throw new Error("Procedure " + procedureName + " not found.");
    }

    const outcome = await procedure.call(this, batch);
    return outcome || batch;
}

/**
 * Get the list of procedure names to run for each batch.
 * @returns {Promise<string[]>} The translator's `translationFlow` option when truthy, otherwise `this.defaultFlow`.
 */
BatchTranslate.prototype.getFlow = async function () {
    const configuredFlow = this.translator.getOptions('translationFlow');
    if (configuredFlow) {
        return configuredFlow;
    }
    return this.defaultFlow;
}

/**
 * Generates translation batch data for the given translator and options.
 *
 * This is an async generator: it walks the rows of the selected files and yields
 * one BatchItems at a time. A batch is closed before a row that would bring its
 * accumulated (escaped) text length to `maxRequestLength` or its size past
 * `rowLimitPerBatch`. A single row longer than `maxRequestLength` gets a batch
 * of its own. Empty batches are never yielded.
 *
 * Note that missing options are filled in on the `options` object itself.
 *
 * @async
 * @generator
 * @function
 * @param {Object|string} translator - The translator engine (resolved through trans.getTranslatorEngine) to be used for translation.
 * @param {Object} options - The options for generating the translation batch data.
 * @param {Function} [options.onFinished=function() {}] - Callback slot; defaulted here but not invoked by this function.
 * @param {number} [options.keyColumn=0] - The column index containing the keys to be translated.
 * @param {boolean} [options.translateOther=false] - Flag indicating whether to translate other columns.
 * @param {boolean} [options.ignoreTranslated=false] - Flag indicating whether to ignore already translated rows.
 * @param {boolean} [options.overwrite=false] - Flag indicating whether to overwrite existing translations.
 * @param {boolean} [options.saveOnEachBatch=false] - Flag indicating whether to save after each batch.
 * @param {boolean} [options.alwaysSparateFile=false] - Flag indicating whether to always translate each file separately.
 * @param {string} [options.filterTagMode] - Mode for filtering tags ('blacklist' or 'whitelist').
 * @param {string[]} [options.filterTag] - Array of tags to filter rows.
 * @param {number} [options.targetColumn=options.keyColumn+1] - The column index where translations will be placed.
 * @param {number} [options.maxRequestLength=5000] - Maximum length of the request to the translator engine.
 * @param {number} [options.rowLimitPerBatch=1000] - Maximum number of rows in one batch.
 * @param {string[]} [options.files] - Array of files to be processed. If not provided, defaults to checked or all files.
 * @param {object} [options.translatiorOptions] - Additional options for the translator engine.
 * @yields {BatchItems} The next batch of TranslationInfo items.
 * @throws {Error} Throws an error if the translator engine is invalid.
 */
BatchTranslate.prototype.getTranslationBatchData = async function* (translator=this.translator, options=this.options) {
    // Keep what was asked for, so the error can name it after the lookup failed.
    const requestedTranslator = translator;
    translator = trans.getTranslatorEngine(requestedTranslator);
    if (!translator) throw new Error("Invalid translator engine " + requestedTranslator);
    options ||= {};
    options.onFinished ||= function () { };
    options.keyColumn = options.keyColumn || trans.keyColumn || 0;
    options.translatiorOptions ||= {};
    options.translateOther ||= false;
    options.ignoreTranslated ||= false;
    options.overwrite ||= false;
    options.saveOnEachBatch ||= false;
    options.alwaysSparateFile = options.alwaysSparateFile || options.translatiorOptions?.alwaysSparateFile || false;
    options.filterTagMode ||= undefined;
    options.filterTag ||= undefined;
    options.targetColumn ||= options.keyColumn + 1;
    options.maxRequestLength = options.maxRequestLength || translator.getOptions('maxRequestLength') || 5000;
    options.rowLimitPerBatch = options.rowLimitPerBatch || translator.getOptions('rowLimitPerBatch') || 1000;

    options.files ||= trans.getCheckedFiles();
    if (!options.files?.length) {
        options.files = trans.getAllFiles();
    }

    console.log("BatchTranslate options:", options);

    // The batch under construction and the total escaped length of its rows.
    // A running total replaces re-joining every collected text on each row.
    let currentBatch = new BatchItems();
    let currentLength = 0;
    const startNewBatch = () => {
        currentBatch = new BatchItems();
        currentLength = 0;
    };

    for (const file of options.files) {
        if (options.alwaysSparateFile && currentBatch.length) {
            yield currentBatch;
            startNewBatch();
        }

        console.log("Processing file", file);
        const currentData = trans.getData(file);
        if (!currentData?.length) continue;

        for (let i = 0; i < currentData.length; i++) {
            const thisRow = currentData[i];
            // skip missing rows and rows without key text
            if (!thisRow?.[trans.keyColumn]) continue;
            const currentSentence = thisRow[options.keyColumn];
            // the source cell may be empty or non-text when keyColumn differs from trans.keyColumn
            if (typeof currentSentence !== "string" || currentSentence.trim().length == 0) continue;

            // skip according to tags
            if (options.filterTagMode == "blacklist") {
                if (trans.hasTags(options.filterTag, i, file)) continue;
            } else if (options.filterTagMode == "whitelist") {
                if (!trans.hasTags(options.filterTag, i, file)) continue;
            }

            // skip line that already translated
            if (options.ignoreTranslated) {
                if (trans.rowHasTranslation(thisRow, options.keyColumn)) continue;
            }

            // skip line if cell is not empty & overwrite option is false
            if (options.overwrite == false) {
                if (thisRow[options.targetColumn]) {
                    continue;
                }
            }

            const translationInfo = {
                row: i,
                original: currentSentence,
                toTranslate: currentSentence,
                path: file,
                translation: "",
                info: {}
            };

            const escapedLength = String(translator.escapeCharacter(currentSentence) ?? "").length;

            // Close the current batch when this row would not fit. The check is skipped
            // for an empty batch so that an oversized row never yields an empty batch.
            const exceedsLength = currentLength + escapedLength >= options.maxRequestLength;
            const exceedsRows = currentBatch.length >= options.rowLimitPerBatch;
            if (currentBatch.length > 0 && (exceedsLength || exceedsRows)) {
                yield currentBatch;
                startNewBatch();
            }

            // store translationInfo into the current batch and account for its length
            currentBatch.push(translationInfo);
            currentLength += escapedLength;
        }
    }
    if (currentBatch.length) yield currentBatch;
}

/**
 * Translate the `original` text of every item in the batch and store the
 * result in each item's `translation`.
 * An empty or missing batch is returned untouched.
 * @param {TranslationInfo[]} batch - The batch of translation data.
 * @returns {Promise<TranslationInfo[]>} The same batch, after translation.
 */
BatchTranslate.prototype.translate = async function(batch) {
    if (!batch?.length) {
        return batch;
    }

    const sourceTexts = batch.map((entry) => entry.original);
    const result = await this.translator.translate(sourceTexts);
    console.log("Translations:", result);

    // hand each translation back to the item at the same position
    for (const [position, entry] of batch.entries()) {
        entry.translation = result?.translation?.[position];
    }

    console.log("Translated batch:", batch);
    return batch;
}

/**
 * Abort the translation run.
 * Does nothing when no `reject` handler has been stored on the instance.
 * Otherwise flags the instance as aborted, rejects with "Aborted", reports
 * the abort in the progress log and refreshes the grid.
 * @returns {Promise<void>}
 */
BatchTranslate.prototype.abort = async function() {
    const hasRejectHandler = Boolean(this.reject);
    if (hasRejectHandler) {
        this.aborted = true;
        this.reject("Aborted");
        await ui.log.progress(100, `Aborted`);
        trans.refreshGrid();
    }
}

/**
 * Translate every batch produced by getTranslationBatchData().
 *
 * Batches are collected once, then each one is pushed through the procedures
 * returned by getFlow(). At most `maxConcurrentRequest` batches run at the same
 * time (from `options`, then the translator's option, then 1; capped at
 * `hardLimitConcurrentRequest`). The run can be interrupted with abort():
 * batches then stop before their next procedure.
 * @param {Object|string} [translator=this.translator] - Translator engine passed on to getTranslationBatchData().
 * @param {Object} [options=this.options] - Batch options passed on to getTranslationBatchData().
 * @returns {Promise<void>} Resolves when all batches are done or the run was aborted;
 *   rejects when a procedure throws.
 */
BatchTranslate.prototype.translateAll = async function(translator=this.translator, options=this.options) {
    console.log("%cCalling batchTranslate with translator", "color:aqua", translator, "and options", options);
    console.log("Calculating batch length...");

    // p-limit only accepts a whole number of at least 1; settings may arrive as strings.
    const requestedConcurrency = Number(options.maxConcurrentRequest || this.translator.getOptions('maxConcurrentRequest') || 1);
    let maxConcurrentRequest = requestedConcurrency >= 1 ? Math.floor(requestedConcurrency) : 1;
    // set hard limit of maxConcurrentRequest to hardLimitConcurrentRequest
    if (maxConcurrentRequest > hardLimitConcurrentRequest) {
        await ui.log(`Your concurrent request is capped at ${hardLimitConcurrentRequest}.`);
        maxConcurrentRequest = hardLimitConcurrentRequest;
    }
    const limit = pLimit(maxConcurrentRequest);

    // A previous abort must not silently cancel this run.
    this.aborted = false;
    this.info.startAt = Date.now();
    this.info.batchOptions = options;

    // generating resolver
    const completion = new Promise((resolve, reject) => {
        this.resolve = resolve;
        this.reject = reject;
    });
    // abort() rejects this promise purely as a signal. Nothing awaits it, so the
    // rejection is marked as handled here to avoid an unhandled-rejection error.
    completion.catch(() => {});

    // Collect the batches once; this gives the batch count without generating them twice.
    const batches = [];
    for await (const batch of this.getTranslationBatchData(translator, options)) {
        if (batch?.length) batches.push(batch);
    }
    const batchLength = batches.length;

    await ui.log("Total batch length: " + batchLength);
    await ui.log("Number of concurrent request: " + maxConcurrentRequest);
    if (batchLength > 1) {
        await ui.log("Batch will be processed concurrently. The log of each batch will be displayed not in ordered manner... that's normal!");
    }
    this.info.totalBatch = batchLength;

    const flows = await this.getFlow();
    console.log("Current procedure:", flows);

    // Run every procedure of the flow on one batch, stopping early on abort.
    const processOneBatch = async (batch) => {
        for (const procedure of flows) {
            if (this.aborted) return;
            await this.runProcedure(procedure, batch);
        }
    };

    // Process batches with p-limit
    const tasks = batches.map((batch, batchIndex) => {
        batch.info.total = batchLength;
        batch.info.index = batchIndex;
        console.log('Processing batch:', batch);
        return limit(() => processOneBatch(batch));
    });

    // Wait for all tasks to complete
    await Promise.all(tasks);

    // abort() already reported "Aborted"; do not overwrite it with "Completed".
    if (!this.aborted) {
        await ui.log.progress(100, `Completed`);
        console.log('All batches processed.');
    }

    trans.refreshGrid();
    this.resolve();
    // The run is over: a late abort() becomes a no-op.
    this.reject = undefined;
};


module.exports = BatchTranslate;