Init
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
.venv/
|
||||||
|
utils/__pycache__/
|
1
1-Input/.gitignore
vendored
Normal file
1
1-Input/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.ass
|
1
2-Output/.gitignore
vendored
Normal file
1
2-Output/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.ass
|
33
AutoTrans.py
Normal file
33
AutoTrans.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import ass
import os

from utils.preprocessing import replace_english_words, remove_ass_statements, remove_new_line_statements, split_sentence
from utils.translator import translator

# Maps SeamlessM4T target-language codes to the short codes used for the
# output file names (e.g. 'deu' -> '2-Output/de.ass').
LANGUAGES_DICT = {
    'deu': 'de',
    'spa': 'es',
    'fra': 'fr',
    'hin': 'hi',
    'por': 'pt',
    'rus': 'ru'
}

# English source subtitles; see Readme.md for the expected layout.
INPUT_FILE = os.path.join('1-Input', 'eng.ass')

for lang in LANGUAGES_DICT:
    # Re-read the source file for every language so each translation starts
    # from the original English text, never from a previous pass.  The
    # original code leaked this file handle; `with` guarantees it is closed.
    with open(INPUT_FILE, 'r', encoding='utf_8_sig') as ass_file:
        ass_doc = ass.parse(ass_file)

    # Clean up each dialogue line, then translate it in place.
    for event in ass_doc.events:
        event.text = remove_ass_statements(event.text)
        event.text = remove_new_line_statements(event.text)
        event.text = replace_english_words(event.text)
        event.text = translator(event.text, lang)

    out_file = os.path.join('2-Output', f'{LANGUAGES_DICT[lang]}.ass')
    with open(out_file, 'w', encoding='utf_8_sig') as output_file:
        ass_doc.dump_file(output_file)
|
28
Readme.md
Normal file
28
Readme.md
Normal file
@@ -0,0 +1,28 @@
### Setup

1. Create a Python venv

```bash
# Python 3.10 is recommended
python3.10 -m venv .venv

# Enable the venv
source .venv/bin/activate
```

2. Install dependencies

```bash
# Required for torch
pip install typing-extensions

# Important: use the CUDA build
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu12

# Other dependencies
pip install transformers sentencepiece sacremoses protobuf
```

### Usage

1. Place `eng.ass` inside the `1-Input` folder
2. Run `AutoTrans.py` with Python in a terminal (don't forget to enable the venv)
11
Setup.PS1
Normal file
11
Setup.PS1
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
|
||||||
|
python -m venv .venv
|
||||||
|
|
||||||
|
.\.venv\Scripts\Activate.ps1
|
||||||
|
|
||||||
|
pip install --no-cache-dir ass
|
||||||
|
pip install --no-cache-dir --pre -U torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||||
|
pip install --no-cache-dir transformers
|
||||||
|
pip install --no-cache-dir sentencepiece
|
||||||
|
pip install --no-cache-dir sacremoses
|
||||||
|
pip install --no-cache-dir protobuf
|
33
utils/preprocessing.py
Normal file
33
utils/preprocessing.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import re
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
# Words the translation model tends to handle poorly; each is swapped for a
# more direct synonym before the text is sent to the translator.
DICTIONARY = {
    'violated': 'raped',
    'semen': 'cum'
}


def replace_english_words(text: str) -> str:
    """Replace every whitespace-separated word found in DICTIONARY.

    Words not present in DICTIONARY pass through unchanged; the result is
    re-joined with single spaces.
    """
    replaced = [DICTIONARY.get(token, token) for token in text.split()]
    return ' '.join(replaced)
|
||||||
|
|
||||||
|
def remove_ass_statements(text: str) -> str:
    """Drop every ``{...}`` ASS style-override block from a subtitle line."""
    # Non-greedy match so each {...} block is removed individually rather
    # than everything between the first '{' and the last '}'.
    override = re.compile(r'\{.*?\}')
    return override.sub('', text)
|
||||||
|
|
||||||
|
def remove_new_line_statements(text: str) -> str:
    """Turn literal ``\\N`` hard-line-break markers into single spaces."""
    # Splitting on the marker and re-joining with spaces is equivalent to
    # replacing each occurrence with one space.
    return " ".join(text.split("\\N"))
|
||||||
|
|
||||||
|
def split_sentence(sentence: str, n: int, translated_sentence: str) -> list[str]:
    """Split a translated sentence into pieces mirroring the original.

    If the original and translated sentences contain the same non-zero
    number of commas, split on commas so the pieces line up one-to-one.
    Otherwise fall back to splitting the translation into ``n`` roughly
    equal word chunks.

    Args:
        sentence: Original (untranslated) sentence.
        n: Desired number of chunks for the fallback split; must be >= 1.
        translated_sentence: Translated sentence to split.

    Returns:
        List of chunk strings; empty if the translation has no words.
    """
    original_commas = sentence.count(',')
    translated_commas = translated_sentence.count(',')

    # Split by commas if comma counts align between original and translation.
    if original_commas != 0 and original_commas == translated_commas:
        return [chunk.strip() for chunk in translated_sentence.split(',')]

    # Otherwise, split into n roughly equal word chunks.
    words = translated_sentence.split()
    if not words:
        # Bug fix: with no words the original computed chunk_size == 0 and
        # range(0, 0, 0) raised "ValueError: range() arg 3 must not be zero".
        return []
    chunk_size = math.ceil(len(words) / n)
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
15
utils/translator.py
Normal file
15
utils/translator.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from transformers import AutoProcessor, SeamlessM4Tv2Model
|
||||||
|
|
||||||
|
# Shared SeamlessM4T v2 checkpoint: one processor/model pair is loaded at
# import time and reused for every translation call.
_CHECKPOINT = "facebook/seamless-m4t-v2-large"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = SeamlessM4Tv2Model.from_pretrained(_CHECKPOINT)
|
||||||
|
|
||||||
|
def translator(subtitle_line: str, target_language: str) -> str:
    """Translate one English subtitle line into ``target_language``.

    Args:
        subtitle_line: Cleaned English subtitle text (no ASS overrides).
        target_language: SeamlessM4T language code (e.g. 'deu', 'spa').

    Returns:
        The translated text with special tokens stripped.
    """
    import torch

    # Bug fix: the original unconditionally called model.to('cuda'), which
    # crashes on CPU-only machines.  Fall back to CPU when CUDA is absent;
    # Module.to() is effectively a no-op once the model is already there.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Tokenize, then move the input tensors to the same device as the model.
    text_inputs = processor(text=subtitle_line, src_lang="eng", return_tensors="pt")
    text_inputs = {key: value.to(device) for key, value in text_inputs.items()}

    # Generate translated token ids (text only, no speech output).
    output_tokens = model.generate(**text_inputs, tgt_lang=target_language, num_beams=5, generate_speech=False)

    # Decode the first (and only) generated sequence back to a string.
    return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
Reference in New Issue
Block a user