commit 1acfc221120e1c7c9e0d2f6ed28f82e6b7bf2c54
Author: w33b
Date: Tue Sep 30 20:43:27 2025 +0200

    Init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f0fffdd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.venv/
+utils/__pycache__/
diff --git a/1-Input/.gitignore b/1-Input/.gitignore
new file mode 100644
index 0000000..40def58
--- /dev/null
+++ b/1-Input/.gitignore
@@ -0,0 +1 @@
+*.ass
\ No newline at end of file
diff --git a/2-Output/.gitignore b/2-Output/.gitignore
new file mode 100644
index 0000000..40def58
--- /dev/null
+++ b/2-Output/.gitignore
@@ -0,0 +1 @@
+*.ass
\ No newline at end of file
diff --git a/AutoTrans.py b/AutoTrans.py
new file mode 100644
index 0000000..58b82e0
--- /dev/null
+++ b/AutoTrans.py
@@ -0,0 +1,33 @@
+import ass
+import os
+
+from utils.preprocessing import replace_english_words, remove_ass_statements, remove_new_line_statements, split_sentence
+from utils.translator import translator
+
+# ISO 639-3 source codes mapped to the two-letter codes used for output file names.
+LANGUAGES_DICT = {
+    'deu': 'de',
+    'spa': 'es',
+    'fra': 'fr',
+    'hin': 'hi',
+    'por': 'pt',
+    'rus': 'ru'
+}
+
+INPUT_FILE = os.path.join('1-Input', 'eng.ass')
+
+for lang, out_code in LANGUAGES_DICT.items():
+    # Re-parse the source file for every language: event texts are mutated in place.
+    with open(INPUT_FILE, 'r', encoding='utf_8_sig') as ass_file:
+        ass_doc = ass.parse(ass_file)
+
+    # Clean up each dialogue line, then translate it.
+    for event in ass_doc.events:
+        event.text = remove_ass_statements(event.text)
+        event.text = remove_new_line_statements(event.text)
+        event.text = replace_english_words(event.text)
+        event.text = translator(event.text, lang)
+
+    out_file = os.path.join('2-Output', f'{out_code}.ass')
+    with open(out_file, 'w', encoding='utf_8_sig') as output_file:
+        ass_doc.dump_file(output_file)
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..b826a22
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,28 @@
+
+### Setup
+
+1. Create Python venv
+   ```bash
+   # Python 3.10 is recommended
+   python3.10 -m venv .venv
+
+   # Enable venv
+   source .venv/bin/activate
+   ```
+
+2. 
Install dependencies
+   ```bash
+   # Required for torch
+   pip install typing-extensions
+
+   # Important to use the CUDA build
+   pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
+
+   # Other
+   pip install ass transformers sentencepiece sacremoses protobuf
+   ```
+
+### Usage
+
+1. Place `eng.ass` inside `1-Input` folder
+2. Run `AutoTrans.py` with Python in terminal (don't forget to enable venv)
\ No newline at end of file
diff --git a/Setup.PS1 b/Setup.PS1
new file mode 100644
index 0000000..134ad8d
--- /dev/null
+++ b/Setup.PS1
@@ -0,0 +1,11 @@
+
+python -m venv .venv
+
+.\.venv\Scripts\Activate.ps1
+
+pip install --no-cache-dir ass
+pip install --no-cache-dir --pre -U torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu126
+pip install --no-cache-dir transformers
+pip install --no-cache-dir sentencepiece
+pip install --no-cache-dir sacremoses
+pip install --no-cache-dir protobuf
\ No newline at end of file
diff --git a/utils/preprocessing.py b/utils/preprocessing.py
new file mode 100644
index 0000000..408f56c
--- /dev/null
+++ b/utils/preprocessing.py
@@ -0,0 +1,35 @@
+import re
+import math
+
+
+# Replacements for words that tend to produce inaccurate translations.
+DICTIONARY = {
+    'violated': 'raped',
+    'semen': 'cum'
+}
+
+def replace_english_words(text: str) -> str:
+    # Word-boundary regex so dictionary words adjacent to punctuation are replaced too.
+    return re.sub(r'\b\w+\b', lambda m: DICTIONARY.get(m.group(0), m.group(0)), text)
+
+def remove_ass_statements(text: str) -> str:
+    # Strip ASS override tags: everything between curly braces (non-greedy).
+    return re.sub(r'\{.*?\}', '', text)
+
+def remove_new_line_statements(text: str) -> str:
+    # Replace ASS hard line breaks (\N) with plain spaces.
+    return text.replace("\\N", " ")
+
+def split_sentence(sentence, n, translated_sentence):
+    original_commas = sentence.count(',')
+    translated_commas = translated_sentence.count(',')
+
+    # Split by commas if commas align
+    if original_commas != 0 and original_commas == translated_commas:
+        return [chunk.strip() for chunk in translated_sentence.split(',')]
+
+    # Otherwise, split 
into n roughly equal chunks
+    words = translated_sentence.split()
+    chunk_size = math.ceil(len(words) / n)
+    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    return chunks
diff --git a/utils/translator.py b/utils/translator.py
new file mode 100644
index 0000000..4dac2dc
--- /dev/null
+++ b/utils/translator.py
@@ -0,0 +1,16 @@
+import torch
+from transformers import AutoProcessor, SeamlessM4Tv2Model
+
+# Load once at import time and move the model to the GPU when one is available,
+# instead of calling model.to('cuda') for every translated line (and crashing on CPU-only hosts).
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to(DEVICE)
+
+def translator(subtitle_line: str, target_language: str) -> str:
+    # Tokenize the English line and move the input tensors to the model's device.
+    text_inputs = processor(text=subtitle_line, src_lang="eng", return_tensors="pt")
+    text_inputs = {key: value.to(DEVICE) for key, value in text_inputs.items()}
+    # Beam-search decode; generate text tokens only (no speech output).
+    output_tokens = model.generate(**text_inputs, tgt_lang=target_language, num_beams=5, generate_speech=False)
+    return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)