Init
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
.venv/
|
||||
utils/__pycache__/
|
1
1-Input/.gitignore
vendored
Normal file
1
1-Input/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.ass
|
1
2-Output/.gitignore
vendored
Normal file
1
2-Output/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.ass
|
33
AutoTrans.py
Normal file
33
AutoTrans.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import ass
import os

from utils.preprocessing import replace_english_words, remove_ass_statements, remove_new_line_statements, split_sentence
from utils.translator import translator

# Maps SeamlessM4T source language codes to the short codes used for
# the output file names in 2-Output/.
LANGUAGES_DICT = {
    'deu': 'de',
    'spa': 'es',
    'fra': 'fr',
    'hin': 'hi',
    'por': 'pt',
    'rus': 'ru'
}

INPUT_FILE = os.path.join('1-Input', 'eng.ass')

for lang in LANGUAGES_DICT:
    # Re-read and re-parse the source file for every language so each
    # translation starts from the untouched English events (the loop below
    # mutates event.text in place).  The context manager fixes the original
    # code's leak: the input file handle was opened per language and never
    # closed.
    with open(INPUT_FILE, 'r', encoding='utf_8_sig') as ass_file:
        ass_doc = ass.parse(ass_file)

    # Clean each subtitle line of ASS markup, then translate it.
    for event in ass_doc.events:
        event.text = remove_ass_statements(event.text)
        event.text = remove_new_line_statements(event.text)
        event.text = replace_english_words(event.text)
        event.text = translator(event.text, lang)

    # Write the translated document next to its language code, e.g. de.ass.
    out_file = os.path.join('2-Output', f'{LANGUAGES_DICT[lang]}.ass')
    with open(out_file, 'w', encoding='utf_8_sig') as output_file:
        ass_doc.dump_file(output_file)
|
28
Readme.md
Normal file
28
Readme.md
Normal file
@@ -0,0 +1,28 @@
|
||||
|
||||
### Setup
|
||||
|
||||
1. Create Python venv
|
||||
```bash
|
||||
# Python 3.10 is recommended
|
||||
python3.10 -m venv .venv
|
||||
|
||||
# Enable venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
2. Install dependencies
|
||||
```bash
|
||||
# Required for torch
|
||||
pip install typing-extensions
|
||||
|
||||
# Important to use the cuda version
|
||||
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
||||
|
||||
# Other
|
||||
pip install transformers sentencepiece sacremoses protobuf
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
1. Place `eng.ass` inside `1-Input` folder
|
||||
2. Run `AutoTrans.py` with Python in terminal (don't forget to enable venv)
|
11
Setup.PS1
Normal file
11
Setup.PS1
Normal file
@@ -0,0 +1,11 @@
|
||||
|
||||
# Create a project-local Python virtual environment.
python -m venv .venv

# Activate the venv for the current PowerShell session.
.\.venv\Scripts\Activate.ps1

# Install dependencies, bypassing the pip cache.
pip install --no-cache-dir ass
# NOTE(review): this pulls a *nightly* (--pre) CUDA 12.6 build of torch, while
# the README documents a stable CUDA build — confirm which one is intended.
pip install --no-cache-dir --pre -U torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu126
pip install --no-cache-dir transformers
pip install --no-cache-dir sentencepiece
pip install --no-cache-dir sacremoses
pip install --no-cache-dir protobuf
|
33
utils/preprocessing.py
Normal file
33
utils/preprocessing.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import re
|
||||
import math
|
||||
|
||||
|
||||
# List of words which might lead to inaccurate translation
DICTIONARY = {
    'violated': 'raped',
    'semen': 'cum'
}

# Whole-word pattern over all dictionary keys, compiled once at import time.
# \b boundaries catch occurrences followed by punctuation ("violated,"),
# which the previous split()-based lookup missed.
_WORD_RE = re.compile(r'\b(' + '|'.join(map(re.escape, DICTIONARY)) + r')\b')


def replace_english_words(text: str) -> str:
    """Replace words that tend to mistranslate with safer synonyms.

    Unlike the former ``' '.join(text.split())`` approach, this preserves the
    original whitespace exactly and also replaces dictionary words that are
    adjacent to punctuation.
    """
    return _WORD_RE.sub(lambda m: DICTIONARY[m.group(0)], text)
|
||||
|
||||
def remove_ass_statements(text: str) -> str:
    """Strip ASS override blocks such as ``{\\i1}`` from a subtitle line."""
    # Non-greedy so each {...} block is removed individually rather than
    # everything between the first '{' and the last '}'.
    override_block = re.compile(r'\{.*?\}')
    return override_block.sub('', text)
|
||||
|
||||
def remove_new_line_statements(text: str) -> str:
    """Turn hard line breaks (the literal ``\\N`` marker) into single spaces."""
    segments = text.split("\\N")
    return " ".join(segments)
|
||||
|
||||
def split_sentence(sentence, n, translated_sentence):
    """Split a translated sentence into chunks mirroring the original.

    Args:
        sentence: The original (source-language) sentence.
        n: Number of chunks to fall back to when commas don't align.
        translated_sentence: The translated sentence to split.

    Returns:
        A list of chunk strings.  Splits on commas when the original and
        translation contain the same non-zero number of commas; otherwise
        splits the translation into roughly equal word groups of size
        ceil(words / n).  Returns [] for an empty/whitespace translation.
    """
    original_commas = sentence.count(',')
    translated_commas = translated_sentence.count(',')

    # Split by commas if commas align between source and translation.
    if original_commas != 0 and original_commas == translated_commas:
        return [chunk.strip() for chunk in translated_sentence.split(',')]

    # Otherwise, split into n roughly equal chunks by word count.
    words = translated_sentence.split()
    if not words:
        # Guard: the original code crashed here (range() step of 0) when the
        # translated sentence had no words.
        return []
    chunk_size = math.ceil(len(words) / n)
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
15
utils/translator.py
Normal file
15
utils/translator.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from transformers import AutoProcessor, SeamlessM4Tv2Model

# Load the SeamlessM4T v2 processor and model once, at import time.
# NOTE(review): the checkpoint is downloaded from the Hugging Face Hub on
# first use, so importing this module requires network access (or a warm
# cache) and is slow — confirm this eager loading is intended.
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
|
||||
|
||||
def translator(subtitle_line: str, target_language: str) -> str:
    """Translate one (already cleaned) English subtitle line.

    Args:
        subtitle_line: The English subtitle text.
        target_language: Target language code understood by SeamlessM4T
            (e.g. 'deu', 'fra').

    Returns:
        The translated text with special tokens stripped.
    """
    # Local import keeps the module importable without touching the
    # file-level imports; torch is already a dependency via transformers.
    import torch

    # Pick the GPU when one is available instead of unconditionally
    # requiring CUDA — the original hard-coded 'cuda' crashed on CPU-only
    # machines.  model.to() is a no-op once the model is already there.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Tokenize, then move the input tensors to the same device as the model.
    text_inputs = processor(text=subtitle_line, src_lang="eng", return_tensors="pt")
    text_inputs = {key: value.to(device) for key, value in text_inputs.items()}

    # Generate output tokens (text only — no speech) with beam search.
    output_tokens = model.generate(**text_inputs, tgt_lang=target_language, num_beams=5, generate_speech=False)

    # Decode the first hypothesis back into a plain string.
    return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
Reference in New Issue
Block a user