Init

2025-09-30 20:43:27 +02:00
commit 1acfc22112
8 changed files with 124 additions and 0 deletions
--- a/utils/preprocessing.py
+++ b/utils/preprocessing.py
@@ -0,0 +1,33 @@
+import re
+import math
+
+
+# Substitution table: each key is an English word which might lead to an
+# inaccurate translation; its value is the word that
+# replace_english_words() puts in its place.
+DICTIONARY = {
+    'violated': 'raped',
+    'semen': 'cum',
+}
+
+def replace_english_words(text: str) -> str:
+    """Replace every word listed in ``DICTIONARY`` with its substitute.
+
+    The text is split on whitespace and re-joined with single spaces, so
+    any run of whitespace (including line breaks) collapses to one space.
+
+    A token that is itself a ``DICTIONARY`` key is swapped directly.
+    Otherwise each run of word characters inside the token is looked up, so
+    listed words that touch punctuation (``"violated,"``, ``"(semen)"``) are
+    replaced too while the punctuation is kept. Matching is case-sensitive.
+
+    Args:
+        text: The text to process.
+
+    Returns:
+        The text with listed words substituted and whitespace normalised.
+    """
+    def _lookup(match):
+        # A callable replacement is inserted verbatim by re.sub, so
+        # substitutes are never interpreted as regex escape sequences.
+        word = match.group(0)
+        return DICTIONARY.get(word, word)
+
+    def _substitute(token):
+        # Exact hit first: preserves the lookup for keys that are not made
+        # purely of word characters.
+        if token in DICTIONARY:
+            return DICTIONARY[token]
+        return re.sub(r'\w+', _lookup, token)
+
+    return ' '.join(_substitute(token) for token in text.split())
+
+def remove_ass_statements(text: str) -> str:
+    """Return ``text`` with every ``{...}`` group removed.
+
+    Each group runs from an opening brace to the nearest closing brace on
+    the same line (``.`` does not match a line break). Presumably these are
+    ASS subtitle override blocks such as ``{\\i1}`` -- confirm with callers.
+    An opening brace with no closing brace is left untouched.
+    """
+    # Non-greedy so that two separate groups are not merged into one match.
+    brace_group = re.compile(r'\{.*?\}')
+    return brace_group.sub('', text)
+
+def remove_new_line_statements(text: str) -> str:
+    """Replace each literal ``\\N`` marker (backslash + capital N) by a space.
+
+    Real newline characters and the lowercase ``\\n`` sequence are left as
+    they are.
+    """
+    segments = text.split("\\N")
+    return " ".join(segments)
+
+def split_sentence(sentence: str, n: int, translated_sentence: str) -> list:
+    """Split ``translated_sentence`` into chunks.
+
+    If ``sentence`` contains at least one comma and ``translated_sentence``
+    contains the same number of commas, the translation is split at its
+    commas (each part whitespace-stripped) and ``n`` is ignored.
+
+    Otherwise the whitespace-separated words of the translation are grouped
+    into consecutive chunks of ``ceil(len(words) / n)`` words, which yields
+    at most ``n`` chunks (fewer when the words do not spread that far).
+
+    Args:
+        sentence: The original text; only its comma count is used.
+        n: Desired number of chunks. Values below 1 are treated as 1.
+        translated_sentence: The text that is actually split.
+
+    Returns:
+        A list of strings. It is empty when the word-based path is taken
+        and ``translated_sentence`` contains no words.
+    """
+    comma_count = sentence.count(',')
+
+    # Split by commas if commas align
+    if comma_count and comma_count == translated_sentence.count(','):
+        return [part.strip() for part in translated_sentence.split(',')]
+
+    words = translated_sentence.split()
+    if not words:
+        # ceil(0 / n) is 0, and range() rejects a step of 0.
+        return []
+
+    # Clamp n so that n <= 0 can neither divide by zero nor produce a
+    # non-positive step.
+    chunk_size = math.ceil(len(words) / max(n, 1))
+    return [
+        ' '.join(words[start:start + chunk_size])
+        for start in range(0, len(words), chunk_size)
+    ]