Init
This commit is contained in:
33
utils/preprocessing.py
Normal file
33
utils/preprocessing.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import re
|
||||
import math
|
||||
|
||||
|
||||
# Words whose literal form tends to confuse the translation model,
# mapped onto replacements that translate more accurately.
DICTIONARY = {
    'violated': 'raped',
    'semen': 'cum'
}


def replace_english_words(text: str) -> str:
    """Substitute each whitespace-delimited word of *text* per DICTIONARY.

    Words not present in DICTIONARY pass through unchanged. Note that
    splitting and re-joining collapses any run of whitespace to a single
    space.
    """
    substituted = [DICTIONARY.get(token, token) for token in text.split()]
    return ' '.join(substituted)
|
||||
|
||||
def remove_ass_statements(text: str) -> str:
    """Strip ASS/SSA override tags from *text*.

    Removes every span enclosed in curly braces, matching non-greedily
    so adjacent tags are deleted independently.
    """
    tag_pattern = re.compile(r'\{.*?\}')
    return tag_pattern.sub('', text)
|
||||
|
||||
def remove_new_line_statements(text: str) -> str:
    """Replace every ASS hard line-break marker (literal ``\\N``) with a space."""
    # Splitting on the marker and re-joining with a space is equivalent
    # to a straight replace: str.split with an explicit separator keeps
    # empty segments, so consecutive markers each become one space.
    return " ".join(text.split("\\N"))
|
||||
|
||||
def split_sentence(sentence, n, translated_sentence):
    """Split a translated sentence into chunks aligned with the original.

    Parameters:
        sentence: The original (untranslated) sentence; used only to
            count its commas.
        n: Desired number of chunks for the fallback even split.
            Must be >= 1.
        translated_sentence: The translation to split.

    Returns:
        A list of chunk strings. When both sentences contain the same
        non-zero number of commas, the translation is split on commas
        (chunks stripped of surrounding whitespace). Otherwise the
        translation is split into roughly equal word-count chunks —
        note this fallback may yield fewer than ``n`` chunks when the
        word count does not divide evenly.
    """
    original_commas = sentence.count(',')
    translated_commas = translated_sentence.count(',')

    # Split by commas when the counts align, so the chunks stay
    # semantically matched between original and translation.
    if original_commas != 0 and original_commas == translated_commas:
        return [chunk.strip() for chunk in translated_sentence.split(',')]

    # Otherwise fall back to n roughly equal word-count chunks.
    words = translated_sentence.split()
    if not words:
        # BUG FIX: an empty/whitespace-only translation used to make
        # chunk_size 0 and crash in range() with a zero step; return n
        # empty chunks instead so downstream alignment is preserved.
        return [''] * n
    chunk_size = math.ceil(len(words) / n)
    return [' '.join(words[i:i + chunk_size])
            for i in range(0, len(words), chunk_size)]
|
15
utils/translator.py
Normal file
15
utils/translator.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from transformers import AutoProcessor, SeamlessM4Tv2Model


# Load the SeamlessM4T-v2 processor (text tokenization) and model once
# at import time. NOTE(review): from_pretrained downloads the weights on
# first use, so importing this module is slow and network-dependent.
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
|
||||
|
||||
def translator(subtitle_line: str, target_language: str, src_lang: str = "eng") -> str:
    """Translate one subtitle line with the SeamlessM4T-v2 model.

    Parameters:
        subtitle_line: The text to translate.
        target_language: Target language code understood by the model
            (e.g. "spa", "fra").
        src_lang: Source language code; defaults to "eng" (previously
            hard-coded), so existing callers are unaffected.

    Returns:
        The decoded translated text.

    NOTE(review): this requires a CUDA device and attempts no CPU
    fallback — confirm that is acceptable for all deployment targets.
    """
    # Ensure the model is on the GPU; moving an already-moved model is a
    # no-op, so calling this per invocation is safe (if slightly wasteful).
    model.to('cuda')
    # Tokenize the input and move every tensor to the GPU with the model.
    text_inputs = processor(text=subtitle_line, src_lang=src_lang, return_tensors="pt")
    text_inputs = {key: value.to('cuda') for key, value in text_inputs.items()}
    # Text-only generation with beam search (5 beams) for quality.
    output_tokens = model.generate(**text_inputs, tgt_lang=target_language, num_beams=5, generate_speech=False)
    # Decode the first sequence of the batch, dropping special tokens.
    return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
Reference in New Issue
Block a user