# AutoTranslate/utils/preprocessing.py

import re
import math


# Mapping of source words to the replacement used instead of them.
# The keys are words which might lead to an inaccurate translation;
# replace_english_words() swaps each key for its value.
DICTIONARY = {
    'violated': 'raped',
    'semen': 'cum',
}

def replace_english_words(text: str) -> str:
    """Return ``text`` with every token found in ``DICTIONARY`` substituted.

    The text is split on whitespace, each token is looked up verbatim in
    ``DICTIONARY`` (exact, case-sensitive match), and the tokens are joined
    back with single spaces. As a consequence, runs of whitespace collapse
    to one space and leading/trailing whitespace is dropped. A token with
    punctuation attached or with different casing does not match a key and
    is left unchanged.
    """
    substituted = []
    for token in text.split():
        # Fall back to the token itself when there is no dictionary entry.
        substituted.append(DICTIONARY.get(token, token))
    return ' '.join(substituted)

def remove_ass_statements(text: str) -> str:
    """Return ``text`` with every curly-brace span removed.

    Each span runs from an opening brace to the nearest closing brace
    (non-greedy), braces included. Presumably these are ASS subtitle
    override blocks, going by the function name. Because ``.`` does not
    match a newline here, a brace pair that straddles a line break is left
    untouched.
    """
    brace_span = re.compile(r'\{.*?\}')
    return brace_span.sub('', text)

def remove_new_line_statements(text: str) -> str:
    """Return ``text`` with each literal backslash-capital-N marker replaced by a space.

    Only the two-character sequence (a backslash followed by an uppercase
    ``N``) is affected; real newline characters and the lowercase variant
    are left as they are.
    """
    line_break_marker = "\\N"
    pieces = text.split(line_break_marker)
    return " ".join(pieces)

def split_sentence(sentence: str, n: int, translated_sentence: str) -> list:
    """Split ``translated_sentence`` into chunks.

    Two strategies are tried in order:

    1. If ``sentence`` contains at least one comma and ``translated_sentence``
       contains the same number of commas, the translation is split on its
       commas and each piece is stripped. ``n`` is ignored in this case.
    2. Otherwise the translation is split on whitespace and its words are
       grouped into consecutive chunks of ``ceil(len(words) / n)`` words.
       This yields *at most* ``n`` chunks; it can yield fewer (for example
       5 words with ``n == 4`` gives chunks of 2, 2 and 1 words).

    Args:
        sentence: The original (untranslated) text; only its comma count
            is used.
        n: Desired number of chunks for the fallback strategy. Values
            below 1 are treated as 1.
        translated_sentence: The text that is actually split.

    Returns:
        A list of strings. It is empty when the fallback strategy is used
        and ``translated_sentence`` contains no words.
    """
    original_commas = sentence.count(',')
    translated_commas = translated_sentence.count(',')

    # Split by commas if commas align
    if original_commas != 0 and original_commas == translated_commas:
        return [chunk.strip() for chunk in translated_sentence.split(',')]

    # Otherwise, split into at most n roughly equal chunks
    words = translated_sentence.split()
    # Clamp the divisor and the step to at least 1: n == 0 would divide by
    # zero, and an empty word list would produce a zero step, which range()
    # rejects with ValueError.
    chunk_size = max(1, math.ceil(len(words) / max(n, 1)))
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]