Init
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
.venv/
|
||||||
|
utils/__pycache__/
|
1
1-Input/.gitignore
vendored
Normal file
1
1-Input/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.ass
|
1
2-Output/.gitignore
vendored
Normal file
1
2-Output/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.ass
|
33
AutoTrans.py
Normal file
33
AutoTrans.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import ass
import os

from utils.preprocessing import replace_english_words, remove_ass_statements, remove_new_line_statements, split_sentence
from utils.translator import translator

# Maps SeamlessM4T target-language codes to the short codes used for the
# output file names (e.g. 'deu' -> '2-Output/de.ass').
LANGUAGES_DICT = {
    'deu': 'de',
    'spa': 'es',
    'fra': 'fr',
    'hin': 'hi',
    'por': 'pt',
    'rus': 'ru'
}

# English source subtitles; see Readme.md for the expected layout.
INPUT_FILE = os.path.join('1-Input', 'eng.ass')

for lang in LANGUAGES_DICT:
    # Re-read the source file for every language so each translation starts
    # from the original English text, never from a previous pass.  The
    # original code leaked this file handle; `with` guarantees it is closed.
    with open(INPUT_FILE, 'r', encoding='utf_8_sig') as ass_file:
        ass_doc = ass.parse(ass_file)

    # Clean up each dialogue line, then translate it in place.
    for event in ass_doc.events:
        event.text = remove_ass_statements(event.text)
        event.text = remove_new_line_statements(event.text)
        event.text = replace_english_words(event.text)
        event.text = translator(event.text, lang)

    out_file = os.path.join('2-Output', f'{LANGUAGES_DICT[lang]}.ass')
    with open(out_file, 'w', encoding='utf_8_sig') as output_file:
        ass_doc.dump_file(output_file)
|
28
Readme.md
Normal file
28
Readme.md
Normal file
@@ -0,0 +1,28 @@
### Setup

1. Create a Python venv

```bash
# Python 3.10 is recommended
python3.10 -m venv .venv

# Enable the venv
source .venv/bin/activate
```

2. Install dependencies

```bash
# Required for torch
pip install typing-extensions

# Important: use the CUDA build
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu12

# Other dependencies
pip install transformers sentencepiece sacremoses protobuf
```

### Usage

1. Place `eng.ass` inside the `1-Input` folder
2. Run `AutoTrans.py` with Python in a terminal (don't forget to enable the venv)
11
Setup.PS1
Normal file
11
Setup.PS1
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
|
||||||
|
python -m venv .venv
|
||||||
|
|
||||||
|
.\.venv\Scripts\Activate.ps1
|
||||||
|
|
||||||
|
pip install --no-cache-dir ass
|
||||||
|
pip install --no-cache-dir --pre -U torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||||
|
pip install --no-cache-dir transformers
|
||||||
|
pip install --no-cache-dir sentencepiece
|
||||||
|
pip install --no-cache-dir sacremoses
|
||||||
|
pip install --no-cache-dir protobuf
|
33
utils/preprocessing.py
Normal file
33
utils/preprocessing.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import re
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
# Words the translation model tends to handle poorly; each is swapped for a
# more direct synonym before the text is sent to the translator.
DICTIONARY = {
    'violated': 'raped',
    'semen': 'cum'
}


def replace_english_words(text: str) -> str:
    """Replace every whitespace-separated word found in DICTIONARY.

    Words not present in DICTIONARY pass through unchanged; the result is
    re-joined with single spaces.
    """
    replaced = [DICTIONARY.get(token, token) for token in text.split()]
    return ' '.join(replaced)
|
||||||
|
|
||||||
|
def remove_ass_statements(text: str) -> str:
    """Drop every ``{...}`` ASS style-override block from a subtitle line."""
    # Non-greedy match so each {...} block is removed individually rather
    # than everything between the first '{' and the last '}'.
    override = re.compile(r'\{.*?\}')
    return override.sub('', text)
|
||||||
|
|
||||||
|
def remove_new_line_statements(text: str) -> str:
    """Turn literal ``\\N`` hard-line-break markers into single spaces."""
    # Splitting on the marker and re-joining with spaces is equivalent to
    # replacing each occurrence with one space.
    return " ".join(text.split("\\N"))
|
||||||
|
|
||||||
|
def split_sentence(sentence: str, n: int, translated_sentence: str) -> list[str]:
    """Split a translated sentence into pieces mirroring the original.

    If the original and translated sentences contain the same non-zero
    number of commas, split on commas so the pieces line up one-to-one.
    Otherwise fall back to splitting the translation into ``n`` roughly
    equal word chunks.

    Args:
        sentence: Original (untranslated) sentence.
        n: Desired number of chunks for the fallback split; must be >= 1.
        translated_sentence: Translated sentence to split.

    Returns:
        List of chunk strings; empty if the translation has no words.
    """
    original_commas = sentence.count(',')
    translated_commas = translated_sentence.count(',')

    # Split by commas if comma counts align between original and translation.
    if original_commas != 0 and original_commas == translated_commas:
        return [chunk.strip() for chunk in translated_sentence.split(',')]

    # Otherwise, split into n roughly equal word chunks.
    words = translated_sentence.split()
    if not words:
        # Bug fix: with no words the original computed chunk_size == 0 and
        # range(0, 0, 0) raised "ValueError: range() arg 3 must not be zero".
        return []
    chunk_size = math.ceil(len(words) / n)
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
15
utils/translator.py
Normal file
15
utils/translator.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from transformers import AutoProcessor, SeamlessM4Tv2Model
|
||||||
|
|
||||||
|
# Shared SeamlessM4T v2 checkpoint: one processor/model pair is loaded at
# import time and reused for every translation call.
_CHECKPOINT = "facebook/seamless-m4t-v2-large"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = SeamlessM4Tv2Model.from_pretrained(_CHECKPOINT)
|
||||||
|
|
||||||
|
def translator(subtitle_line: str, target_language: str) -> str:
    """Translate one English subtitle line into ``target_language``.

    Args:
        subtitle_line: Cleaned English subtitle text (no ASS overrides).
        target_language: SeamlessM4T language code (e.g. 'deu', 'spa').

    Returns:
        The translated text with special tokens stripped.
    """
    import torch

    # Bug fix: the original unconditionally called model.to('cuda'), which
    # crashes on CPU-only machines.  Fall back to CPU when CUDA is absent;
    # Module.to() is effectively a no-op once the model is already there.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Tokenize, then move the input tensors to the same device as the model.
    text_inputs = processor(text=subtitle_line, src_lang="eng", return_tensors="pt")
    text_inputs = {key: value.to(device) for key, value in text_inputs.items()}

    # Generate translated token ids (text only, no speech output).
    output_tokens = model.generate(**text_inputs, tgt_lang=target_language, num_beams=5, generate_speech=False)

    # Decode the first (and only) generated sequence back to a string.
    return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
Reference in New Issue
Block a user