Initial commit
- .gitignore +4 -0
- __pycache__/main.cpython-310.pyc +0 -0
- app.py +39 -0
- experiments/language_translation.ipynb +476 -0
- logs/translation_error.log +2 -0
- requirements.txt +4 -0
- settings.json +3 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/main.cpython-310.pyc +0 -0
- src/__pycache__/translation.cpython-310.pyc +0 -0
- src/classmodels/__init__.py +0 -0
- src/classmodels/__pycache__/__init__.cpython-310.pyc +0 -0
- src/classmodels/__pycache__/translationinput.cpython-310.pyc +0 -0
- src/classmodels/__pycache__/translationoutput.cpython-310.pyc +0 -0
- src/classmodels/translationinput.py +6 -0
- src/classmodels/translationoutput.py +7 -0
- src/errorlog/__init__.py +0 -0
- src/errorlog/__pycache__/__init__.cpython-310.pyc +0 -0
- src/errorlog/__pycache__/errorlog.cpython-310.pyc +0 -0
- src/errorlog/errorlog.py +27 -0
- src/translation.py +47 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+venv/
+experiments/
+logs/
+__pycache__/
__pycache__/main.cpython-310.pyc
ADDED
Binary file (1.23 kB).
app.py
ADDED
@@ -0,0 +1,39 @@
+from fastapi import FastAPI
+from pathlib import Path
+import sys
+
+# Get the absolute path to the 'src' directory
+src_path = Path(__file__).resolve().parent / 'src'
+
+# Add 'src' directory to the Python path (sys.path)
+sys.path.append(str(src_path))
+
+from src.classmodels.translationinput import TranslationInput
+from src.classmodels.translationoutput import TranslationOutput
+from fastapi.middleware.cors import CORSMiddleware
+from src.translation import translate_text
+
+app = FastAPI()
+
+origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+@app.post("/cmsai/translate", response_model=TranslationOutput)
+async def translate(input: TranslationInput):
+    try:
+        output = translate_text(input.text_to_translate, input.target_language)
+        if output is not None:
+            return TranslationOutput(status_code=200, translated_text=output)
+        else:
+            return TranslationOutput(status_code=400, message="target language is not supported")
+    except Exception as e:
+        return TranslationOutput(status_code=500, message=str(e))
+
+#if __name__ == "__main__":
+    #translate(TranslationInput(text_to_translate="Sample",target_language="zh-Cn"))
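A minimal local smoke test of the endpoint above (not part of the commit; it assumes the requirements are installed, the app is exercised in-process, and that FastAPI's TestClient, which needs the httpx package, is available). The request body mirrors the TranslationInput model and the response mirrors TranslationOutput:

# Hypothetical smoke test, run from the repo root.
from fastapi.testclient import TestClient
from app import app

client = TestClient(app)
response = client.post(
    "/cmsai/translate",
    json={"text_to_translate": "Good morning", "target_language": "ja"},
)
# Expect a TranslationOutput payload: translated_text, status_code, message.
print(response.json())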
experiments/language_translation.ipynb
ADDED
@@ -0,0 +1,476 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check if the settings file with the supported languages is available and can be loaded"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def isSettingsFileAvailable():\n",
+    "    current_dir = Path.cwd()\n",
+    "    file_path = current_dir.parent / 'settings.json'\n",
+    "    try:\n",
+    "        if file_path.exists() and file_path.is_file():\n",
+    "            with file_path.open('r') as file:\n",
+    "                settings = json.load(file)\n",
+    "                return settings\n",
+    "        else:\n",
+    "            return \"Settings file is not found\"\n",
+    "    except Exception as err:\n",
+    "        return \"Issue reading the settings file\"\n",
+    "    finally:\n",
+    "        if \"file\" in locals() and not file.closed:\n",
+    "            file.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the settings file is present ---> validate that the ISO code passed to the API is a valid one"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'language_supported': ['en', 'zh-CN', 'zh-TW', 'ms', 'ja', 'kr']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "value = isSettingsFileAvailable()\n",
+    "print(value)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Provide a logging mechanism to handle any errors during the translation process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import json\n",
+    "\n",
+    "# Configure logging\n",
+    "logging.basicConfig(level=logging.ERROR,\n",
+    "                    format='%(asctime)s %(levelname)s %(message)s',\n",
+    "                    handlers=[\n",
+    "                        logging.FileHandler(\"../logs/translation_error.log\"),\n",
+    "                        logging.StreamHandler()\n",
+    "                    ])\n",
+    "\n",
+    "logger = logging.getLogger()\n",
+    "\n",
+    "def log_error(error_message):\n",
+    "    try:\n",
+    "        log_entry = {\n",
+    "            \"error_message\": error_message\n",
+    "        }\n",
+    "        logger.error(json.dumps(log_entry))\n",
+    "    except json.JSONDecodeError as json_err:\n",
+    "        logger.error(f\"Failed to serialize error message as JSON: {error_message}\")\n",
+    "        logger.error(f\"JSON serialization error details: {json_err}\")\n",
+    "    except Exception as ex:\n",
+    "        logger.error(f\"An error occurred while logging: {str(ex)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check if the target language is within the supported list; if yes, proceed with the translation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def isTargetLanguageSupported(target_langcode):\n",
+    "    try:\n",
+    "        settings_config = isSettingsFileAvailable()\n",
+    "        language_config = settings_config.get('language_supported','')\n",
+    "        if language_config and target_langcode.lower() in language_config:\n",
+    "            return True\n",
+    "        else:\n",
+    "            log_error(f\"Language ---{target_langcode}--- provided is not supported as per settings\")\n",
+    "            return False\n",
+    "    except Exception as ex:\n",
+    "        log_error(str(ex))\n",
+    "        return False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-06-25 12:13:45,428 ERROR {\"error_message\": \"Language ---zh-CN--- provided is not supported as per settings\"}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(isTargetLanguageSupported('zh-CN'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After these basic checks, let's start with the actual translation process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -q deep_translator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from deep_translator import GoogleTranslator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def translate_text_usingGoogleTranslator(text, language):\n",
+    "    try:\n",
+    "        isLanguageSupported = isTargetLanguageSupported(language)\n",
+    "        if isLanguageSupported:\n",
+    "            translated_text = GoogleTranslator(source='auto', target=language).translate(text)\n",
+    "            return translated_text\n",
+    "        else:\n",
+    "            return False\n",
+    "    except Exception as ex:\n",
+    "        log_error(str(ex))\n",
+    "        return False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-06-25 12:14:23,295 ERROR {\"error_message\": \"Language ---zh-CN--- provided is not supported as per settings\"}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(translate_text_usingGoogleTranslator('Machine learning.','zh-CN'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calculate the BLEU score - this will be calculated between the translated text and a reference text (generated via the MyMemory provider)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Step 1 - Populate the reference text, which comes from the MyMemory provider"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#rc1 is the release-candidate version of the googletrans package\n",
+    "\n",
+    "%pip install -q googletrans==4.0.0-rc1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once the source language is known, use the MyMemory provider to populate the reference text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from translate import Translator\n",
+    "\n",
+    "def translate_text_usingMyMemory(text, from_lang, to_lang):\n",
+    "    translator = Translator(provider='mymemory', from_lang=from_lang, to_lang=to_lang)\n",
+    "    return translator.translate(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'我很好'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "translate_text_usingMyMemory('i am good','en', 'zh')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Auto-detect the language ---- IF NEEDED"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Detected language: ceb\n"
+     ]
+    }
+   ],
+   "source": [
+    "from googletrans import Translator\n",
+    "\n",
+    "def detect_language_with_googletrans(text):\n",
+    "    translator = Translator()\n",
+    "    detection = translator.detect(text)\n",
+    "    return detection.lang\n",
+    "\n",
+    "# Example usage\n",
+    "text = \"naunsa ka dili man ko maayo\"\n",
+    "detected_language = detect_language_with_googletrans(text)\n",
+    "print(f\"Detected language: {detected_language}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Perform a metrics evaluation of how good the translation is; we will use the BLEU score for that"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#nltk - Natural Language Toolkit, the library used to tokenize the words\n",
+    "#jieba - used for tokenizing Chinese text only, as tokenization works a bit differently there\n",
+    "%pip install -q nltk jieba"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "BLEU score calculation for Chinese text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import jieba\n",
+    "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
+    "\n",
+    "def calculate_bleu_score_usingjieba(reference_text, candidate_text):\n",
+    "    # Use jieba to tokenize the sentences\n",
+    "    reference_tokens = list(jieba.cut(reference_text))\n",
+    "    candidate_tokens = list(jieba.cut(candidate_text))\n",
+    "\n",
+    "    # Wrap the reference tokens in a nested list\n",
+    "    reference = [reference_tokens]\n",
+    "    candidate = candidate_tokens\n",
+    "\n",
+    "    # Calculate BLEU score with smoothing\n",
+    "    bleu_score = sentence_bleu(reference, candidate, smoothing_function=SmoothingFunction().method6)\n",
+    "    print(bleu_score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Building prefix dict from the default dictionary ...\n",
+      "2024-06-25 09:36:09,429 DEBUG Building prefix dict from the default dictionary ...\n",
+      "Loading model from cache C:\\Users\\soumya\\AppData\\Local\\Temp\\jieba.cache\n",
+      "2024-06-25 09:36:09,558 DEBUG Loading model from cache C:\\Users\\soumya\\AppData\\Local\\Temp\\jieba.cache\n",
+      "Loading model cost 0.820 seconds.\n",
+      "2024-06-25 09:36:10,361 DEBUG Loading model cost 0.820 seconds.\n",
+      "Prefix dict has been built successfully.\n",
+      "2024-06-25 09:36:10,362 DEBUG Prefix dict has been built successfully.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "calculate_bleu_score_usingjieba('我很好','我很好')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calculate the BLEU score for other languages such as English, Malay, etc.\n",
+    "The tokenizer used here is NLTK's word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "def calculate_bleu_score_usingnltk(reference_text, candidate_text):\n",
+    "    reference_tokens = word_tokenize(reference_text.lower())\n",
+    "    candidate_tokens = word_tokenize(candidate_text.lower())\n",
+    "\n",
+    "    print(reference_tokens)\n",
+    "    print(candidate_tokens)\n",
+    "\n",
+    "    # Calculate BLEU score with smoothing\n",
+    "    bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method2)\n",
+    "    print(bleu_score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['saya', 'baik']\n",
+      "['saya', 'baik']\n",
+      "0.7071067811865476\n"
+     ]
+    }
+   ],
+   "source": [
+    "calculate_bleu_score_usingnltk(\"saya baik\",'saya baik')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Questions:\n",
+    "\n",
+    "1) I have configured the supported languages in the settings file, correct?\n",
+    "2) Will each request be based on one text per target language?"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
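One environment note on the BLEU cells above, stated as an assumption because the notebook does not show it: NLTK's word_tokenize depends on the 'punkt' tokenizer data, which has to be downloaded once per environment before calculate_bleu_score_usingnltk will run:

# One-time NLTK data download assumed by word_tokenize (not shown in the notebook).
import nltk
nltk.download('punkt')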
logs/translation_error.log
ADDED
@@ -0,0 +1,2 @@
+2024-06-25 14:59:56,262 ERROR Language provided is not supported as per settings
+2024-06-25 15:03:47,430 ERROR Language ---jp--- provided is not supported as per settings
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+fastapi
+uvicorn
+pydantic
+deep_translator
settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "language_supported": ["en", "zh-CN", "zh-TW", "ms", "ja", "kr"]
+}
src/__init__.py
ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (164 Bytes).
src/__pycache__/main.cpython-310.pyc
ADDED
Binary file (1.07 kB).
src/__pycache__/translation.cpython-310.pyc
ADDED
Binary file (1.84 kB).
src/classmodels/__init__.py
ADDED
File without changes
src/classmodels/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (176 Bytes).
src/classmodels/__pycache__/translationinput.cpython-310.pyc
ADDED
Binary file (598 Bytes).
src/classmodels/__pycache__/translationoutput.cpython-310.pyc
ADDED
Binary file (731 Bytes).
src/classmodels/translationinput.py
ADDED
@@ -0,0 +1,6 @@
+from pydantic import BaseModel, Field
+
+class TranslationInput(BaseModel):
+    text_to_translate : str = Field(..., description="Text to be translated")
+    target_language : str = Field(..., description="Target language for translation")
+
src/classmodels/translationoutput.py
ADDED
@@ -0,0 +1,7 @@
+from pydantic import BaseModel, Field
+from typing import Optional
+
+class TranslationOutput(BaseModel):
+    translated_text: Optional[str] = Field(None, description="The final text which has been translated to output")
+    status_code: int = Field(description="Status code")
+    message : Optional[str] = Field(None, description="track any exception message received")
src/errorlog/__init__.py
ADDED
File without changes
src/errorlog/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (173 Bytes).
src/errorlog/__pycache__/errorlog.cpython-310.pyc
ADDED
Binary file (913 Bytes).
src/errorlog/errorlog.py
ADDED
@@ -0,0 +1,27 @@
+import logging
+import json
+from logging.handlers import TimedRotatingFileHandler
+from datetime import datetime
+from pathlib import Path
+
+
+parent_path = Path(__file__).resolve().parent.parent.parent
+final_path = parent_path/'logs'/'translation_error.log'
+
+# Configure logging with TimedRotatingFileHandler
+logging.basicConfig(level=logging.ERROR,
+                    format='%(asctime)s %(levelname)s %(message)s')
+
+# Create a TimedRotatingFileHandler
+handler = TimedRotatingFileHandler(filename=final_path, when='W0', interval=1, backupCount=0, encoding='utf-8')
+
+# Set the log file name format (optional)
+handler.suffix = "%Y-%m-%d_%H-%M-%S.log"
+
+# Set the logging format
+handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+
+logger = logging.getLogger().addHandler(handler)
+
+def log_error(error_message):
+    logging.error(error_message)
src/translation.py
ADDED
@@ -0,0 +1,47 @@
+import json
+from pathlib import Path
+from errorlog.errorlog import log_error
+from deep_translator import GoogleTranslator
+
+def isSettingsFileAvailable():
+    current_dir = Path(__file__).resolve().parent
+    file_path = current_dir.parent / 'settings.json'
+
+    try:
+        if file_path.exists() and file_path.is_file():
+            with file_path.open('r') as file:
+                settings = json.load(file)
+                return settings
+        else:
+            return "Settings file is not found"
+    except Exception as err:
+        return "Issue reading the settings file"
+    finally:
+        if "file" in locals() and not file.closed:
+            file.close()
+
+def isTargetLanguageSupported(target_langcode):
+    try:
+        settings_config = isSettingsFileAvailable()
+        language_config = settings_config.get('language_supported','')
+        if language_config and target_langcode.lower() in language_config:
+            return True
+        else:
+            log_error(f"Language ---{target_langcode}--- provided is not supported as per settings")
+            return False
+    except Exception as ex:
+        log_error(str(ex))
+        return False
+
+def translate_text(text, language):
+    try:
+        isLanguageSupported = isTargetLanguageSupported(language)
+        if isLanguageSupported:
+            translated_text = GoogleTranslator(source='auto', target=language).translate(text)
+            return translated_text
+        else:
+            return f"Language ---{language}--- provided is not supported as per settings"
+    except Exception as ex:
+        log_error(str(ex))
+        return "Error processing the request"
+
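A note on isTargetLanguageSupported (both here and in the notebook cell it mirrors): the function lowercases the incoming code before the membership check, while settings.json stores mixed-case codes such as "zh-CN", so those entries can never match; this is the mismatch behind the "Language ---zh-CN--- provided is not supported" output recorded in the notebook. A sketch of a case-insensitive check, offered as one possible adjustment rather than part of the commit (the helper name is hypothetical):

# Sketch: normalise both sides so codes like "zh-CN" match the configured list.
def is_code_supported(target_langcode, language_config):
    supported = {code.lower() for code in language_config}
    return target_langcode.lower() in supported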