Initial commit
- .gitignore +4 -0
- __pycache__/main.cpython-310.pyc +0 -0
- app.py +39 -0
- experiments/language_translation.ipynb +476 -0
- logs/translation_error.log +2 -0
- requirements.txt +4 -0
- settings.json +3 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/main.cpython-310.pyc +0 -0
- src/__pycache__/translation.cpython-310.pyc +0 -0
- src/classmodels/__init__.py +0 -0
- src/classmodels/__pycache__/__init__.cpython-310.pyc +0 -0
- src/classmodels/__pycache__/translationinput.cpython-310.pyc +0 -0
- src/classmodels/__pycache__/translationoutput.cpython-310.pyc +0 -0
- src/classmodels/translationinput.py +6 -0
- src/classmodels/translationoutput.py +7 -0
- src/errorlog/__init__.py +0 -0
- src/errorlog/__pycache__/__init__.cpython-310.pyc +0 -0
- src/errorlog/__pycache__/errorlog.cpython-310.pyc +0 -0
- src/errorlog/errorlog.py +27 -0
- src/translation.py +47 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+venv/
+experiments/
+logs/
+__pycache__/
__pycache__/main.cpython-310.pyc
ADDED
Binary file (1.23 kB).
app.py
ADDED
@@ -0,0 +1,39 @@
+from fastapi import FastAPI
+from pathlib import Path
+import sys
+
+# Get the absolute path to the 'src' directory
+src_path = Path(__file__).resolve().parent / 'src'
+
+# Add 'src' directory to the Python path (sys.path)
+sys.path.append(str(src_path))
+
+from src.classmodels.translationinput import TranslationInput
+from src.classmodels.translationoutput import TranslationOutput
+from fastapi.middleware.cors import CORSMiddleware
+from src.translation import translate_text
+
+app = FastAPI()
+
+origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+@app.post("/cmsai/translate", response_model=TranslationOutput)
+async def translate(input: TranslationInput):
+    try:
+        output = translate_text(input.text_to_translate, input.target_language)
+        if output is not None:
+            return TranslationOutput(status_code=200, translated_text=output)
+        else:
+            return TranslationOutput(status_code=400, message="target language is not supported")
+    except Exception as e:
+        return TranslationOutput(status_code=500, message=str(e))
+
+#if __name__ == "__main__":
+    #translate(TranslationInput(text_to_translate="Sample",target_language="zh-Cn"))
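A minimal local smoke test of the endpoint above (not part of the commit; it assumes the requirements are installed, the app is exercised in-process, and that FastAPI's TestClient, which needs the httpx package, is available). The request body mirrors the TranslationInput model and the response mirrors TranslationOutput:

# Hypothetical smoke test, run from the repo root.
from fastapi.testclient import TestClient
from app import app

client = TestClient(app)
response = client.post(
    "/cmsai/translate",
    json={"text_to_translate": "Good morning", "target_language": "ja"},
)
# Expect a TranslationOutput payload: translated_text, status_code, message.
print(response.json())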
experiments/language_translation.ipynb
ADDED
@@ -0,0 +1,476 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check if the settings file with the supported languages is available and can be loaded"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def isSettingsFileAvailable():\n",
+    "    current_dir = Path.cwd()\n",
+    "    file_path = current_dir.parent / 'settings.json'\n",
+    "    try:\n",
+    "        if file_path.exists() and file_path.is_file():\n",
+    "            with file_path.open('r') as file:\n",
+    "                settings = json.load(file)\n",
+    "                return settings\n",
+    "        else:\n",
+    "            return \"Settings file is not found\"\n",
+    "    except Exception as err:\n",
+    "        return \"Issue reading the settings file\"\n",
+    "    finally:\n",
+    "        if \"file\" in locals() and not file.closed:\n",
+    "            file.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the settings file is present ---> validate that the ISO code passed to the API is a valid one"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'language_supported': ['en', 'zh-CN', 'zh-TW', 'ms', 'ja', 'kr']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "value = isSettingsFileAvailable()\n",
+    "print(value)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Provide a logging mechanism to handle any errors during the translation process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import json\n",
+    "\n",
+    "# Configure logging\n",
+    "logging.basicConfig(level=logging.ERROR,\n",
+    "                    format='%(asctime)s %(levelname)s %(message)s',\n",
+    "                    handlers=[\n",
+    "                        logging.FileHandler(\"../logs/translation_error.log\"),\n",
+    "                        logging.StreamHandler()\n",
+    "                    ])\n",
+    "\n",
+    "logger = logging.getLogger()\n",
+    "\n",
+    "def log_error(error_message):\n",
+    "    try:\n",
+    "        log_entry = {\n",
+    "            \"error_message\": error_message\n",
+    "        }\n",
+    "        logger.error(json.dumps(log_entry))\n",
+    "    except json.JSONDecodeError as json_err:\n",
+    "        logger.error(f\"Failed to serialize error message as JSON: {error_message}\")\n",
+    "        logger.error(f\"JSON serialization error details: {json_err}\")\n",
+    "    except Exception as ex:\n",
+    "        logger.error(f\"An error occurred while logging: {str(ex)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check if the target language is within the supported list; if yes, proceed with the translation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def isTargetLanguageSupported(target_langcode):\n",
+    "    try:\n",
+    "        settings_config = isSettingsFileAvailable()\n",
+    "        language_config = settings_config.get('language_supported','')\n",
+    "        if language_config and target_langcode.lower() in language_config:\n",
+    "            return True\n",
+    "        else:\n",
+    "            log_error(f\"Language ---{target_langcode}--- provided is not supported as per settings\")\n",
+    "            return False\n",
+    "    except Exception as ex:\n",
+    "        log_error(str(ex))\n",
+    "        return False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-06-25 12:13:45,428 ERROR {\"error_message\": \"Language ---zh-CN--- provided is not supported as per settings\"}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(isTargetLanguageSupported('zh-CN'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After these basic checks, let's start with the actual translation process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -q deep_translator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from deep_translator import GoogleTranslator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def translate_text_usingGoogleTranslator(text, language):\n",
+    "    try:\n",
+    "        isLanguageSupported = isTargetLanguageSupported(language)\n",
+    "        if isLanguageSupported:\n",
+    "            translated_text = GoogleTranslator(source='auto', target=language).translate(text)\n",
+    "            return translated_text\n",
+    "        else:\n",
+    "            return False\n",
+    "    except Exception as ex:\n",
+    "        log_error(str(ex))\n",
+    "        return False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-06-25 12:14:23,295 ERROR {\"error_message\": \"Language ---zh-CN--- provided is not supported as per settings\"}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(translate_text_usingGoogleTranslator('Machine learning.','zh-CN'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calculate the BLEU score - this will be calculated between the translated text and a reference text (generated via the MyMemory provider)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Step 1 - Populate the reference text, which comes from the MyMemory provider"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#rc1 is the release-candidate version of the googletrans package\n",
+    "\n",
+    "%pip install -q googletrans==4.0.0-rc1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once the source language is known, use the MyMemory provider to populate the reference text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from translate import Translator\n",
+    "\n",
+    "def translate_text_usingMyMemory(text, from_lang, to_lang):\n",
+    "    translator = Translator(provider='mymemory', from_lang=from_lang, to_lang=to_lang)\n",
+    "    return translator.translate(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'我很好'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "translate_text_usingMyMemory('i am good','en', 'zh')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Auto-detect the language ---- IF NEEDED"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Detected language: ceb\n"
+     ]
+    }
+   ],
+   "source": [
+    "from googletrans import Translator\n",
+    "\n",
+    "def detect_language_with_googletrans(text):\n",
+    "    translator = Translator()\n",
+    "    detection = translator.detect(text)\n",
+    "    return detection.lang\n",
+    "\n",
+    "# Example usage\n",
+    "text = \"naunsa ka dili man ko maayo\"\n",
+    "detected_language = detect_language_with_googletrans(text)\n",
+    "print(f\"Detected language: {detected_language}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Perform a metrics evaluation of how good the translation is; we will use the BLEU score for that"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#nltk - Natural Language Toolkit, the library used to tokenize the words\n",
+    "#jieba - used for tokenizing Chinese text only, as tokenization works a bit differently there\n",
+    "%pip install -q nltk jieba"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "BLEU score calculation for Chinese text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import jieba\n",
+    "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
+    "\n",
+    "def calculate_bleu_score_usingjieba(reference_text, candidate_text):\n",
+    "    # Use jieba to tokenize the sentences\n",
+    "    reference_tokens = list(jieba.cut(reference_text))\n",
+    "    candidate_tokens = list(jieba.cut(candidate_text))\n",
+    "\n",
+    "    # Wrap the reference tokens in a nested list\n",
+    "    reference = [reference_tokens]\n",
+    "    candidate = candidate_tokens\n",
+    "\n",
+    "    # Calculate BLEU score with smoothing\n",
+    "    bleu_score = sentence_bleu(reference, candidate, smoothing_function=SmoothingFunction().method6)\n",
+    "    print(bleu_score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Building prefix dict from the default dictionary ...\n",
+      "2024-06-25 09:36:09,429 DEBUG Building prefix dict from the default dictionary ...\n",
+      "Loading model from cache C:\\Users\\soumya\\AppData\\Local\\Temp\\jieba.cache\n",
+      "2024-06-25 09:36:09,558 DEBUG Loading model from cache C:\\Users\\soumya\\AppData\\Local\\Temp\\jieba.cache\n",
+      "Loading model cost 0.820 seconds.\n",
+      "2024-06-25 09:36:10,361 DEBUG Loading model cost 0.820 seconds.\n",
+      "Prefix dict has been built successfully.\n",
+      "2024-06-25 09:36:10,362 DEBUG Prefix dict has been built successfully.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "calculate_bleu_score_usingjieba('我很好','我很好')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calculate the BLEU score for other languages such as English, Malay, etc.\n",
+    "The tokenizer used here is NLTK's word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "def calculate_bleu_score_usingnltk(reference_text, candidate_text):\n",
+    "    reference_tokens = word_tokenize(reference_text.lower())\n",
+    "    candidate_tokens = word_tokenize(candidate_text.lower())\n",
+    "\n",
+    "    print(reference_tokens)\n",
+    "    print(candidate_tokens)\n",
+    "\n",
+    "    # Calculate BLEU score with smoothing\n",
+    "    bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method2)\n",
+    "    print(bleu_score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['saya', 'baik']\n",
+      "['saya', 'baik']\n",
+      "0.7071067811865476\n"
+     ]
+    }
+   ],
+   "source": [
+    "calculate_bleu_score_usingnltk(\"saya baik\",'saya baik')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Questions:\n",
+    "\n",
+    "1) I have configured the supported languages in the settings file, correct?\n",
+    "2) Will each request be based on one text per target language?"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
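One environment note on the BLEU cells above, stated as an assumption because the notebook does not show it: NLTK's word_tokenize depends on the 'punkt' tokenizer data, which has to be downloaded once per environment before calculate_bleu_score_usingnltk will run:

# One-time NLTK data download assumed by word_tokenize (not shown in the notebook).
import nltk
nltk.download('punkt')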
logs/translation_error.log
ADDED
@@ -0,0 +1,2 @@
+2024-06-25 14:59:56,262 ERROR Language provided is not supported as per settings
+2024-06-25 15:03:47,430 ERROR Language ---jp--- provided is not supported as per settings
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+fastapi
+uvicorn
+pydantic
+deep_translator
settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "language_supported": ["en", "zh-CN", "zh-TW", "ms", "ja", "kr"]
+}
src/__init__.py
ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (164 Bytes).
src/__pycache__/main.cpython-310.pyc
ADDED
Binary file (1.07 kB).
src/__pycache__/translation.cpython-310.pyc
ADDED
Binary file (1.84 kB).
src/classmodels/__init__.py
ADDED
File without changes
src/classmodels/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (176 Bytes).
src/classmodels/__pycache__/translationinput.cpython-310.pyc
ADDED
Binary file (598 Bytes).
src/classmodels/__pycache__/translationoutput.cpython-310.pyc
ADDED
Binary file (731 Bytes).
src/classmodels/translationinput.py
ADDED
@@ -0,0 +1,6 @@
+from pydantic import BaseModel, Field
+
+class TranslationInput(BaseModel):
+    text_to_translate : str = Field(..., description="Text to be translated")
+    target_language : str = Field(..., description="Target language for translation")
+
src/classmodels/translationoutput.py
ADDED
@@ -0,0 +1,7 @@
+from pydantic import BaseModel, Field
+from typing import Optional
+
+class TranslationOutput(BaseModel):
+    translated_text: Optional[str] = Field(None, description="The final text which has been translated to output")
+    status_code: int = Field(description="Status code")
+    message : Optional[str] = Field(None, description="track any exception message received")
src/errorlog/__init__.py
ADDED
File without changes
src/errorlog/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (173 Bytes).
src/errorlog/__pycache__/errorlog.cpython-310.pyc
ADDED
Binary file (913 Bytes).
src/errorlog/errorlog.py
ADDED
@@ -0,0 +1,27 @@
+import logging
+import json
+from logging.handlers import TimedRotatingFileHandler
+from datetime import datetime
+from pathlib import Path
+
+
+parent_path = Path(__file__).resolve().parent.parent.parent
+final_path = parent_path/'logs'/'translation_error.log'
+
+# Configure logging with TimedRotatingFileHandler
+logging.basicConfig(level=logging.ERROR,
+                    format='%(asctime)s %(levelname)s %(message)s')
+
+# Create a TimedRotatingFileHandler
+handler = TimedRotatingFileHandler(filename=final_path, when='W0', interval=1, backupCount=0, encoding='utf-8')
+
+# Set the log file name format (optional)
+handler.suffix = "%Y-%m-%d_%H-%M-%S.log"
+
+# Set the logging format
+handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+
+logger = logging.getLogger().addHandler(handler)
+
+def log_error(error_message):
+    logging.error(error_message)
src/translation.py
ADDED
@@ -0,0 +1,47 @@
+import json
+from pathlib import Path
+from errorlog.errorlog import log_error
+from deep_translator import GoogleTranslator
+
+def isSettingsFileAvailable():
+    current_dir = Path(__file__).resolve().parent
+    file_path = current_dir.parent / 'settings.json'
+
+    try:
+        if file_path.exists() and file_path.is_file():
+            with file_path.open('r') as file:
+                settings = json.load(file)
+                return settings
+        else:
+            return "Settings file is not found"
+    except Exception as err:
+        return "Issue reading the settings file"
+    finally:
+        if "file" in locals() and not file.closed:
+            file.close()
+
+def isTargetLanguageSupported(target_langcode):
+    try:
+        settings_config = isSettingsFileAvailable()
+        language_config = settings_config.get('language_supported','')
+        if language_config and target_langcode.lower() in language_config:
+            return True
+        else:
+            log_error(f"Language ---{target_langcode}--- provided is not supported as per settings")
+            return False
+    except Exception as ex:
+        log_error(str(ex))
+        return False
+
+def translate_text(text, language):
+    try:
+        isLanguageSupported = isTargetLanguageSupported(language)
+        if isLanguageSupported:
+            translated_text = GoogleTranslator(source='auto', target=language).translate(text)
+            return translated_text
+        else:
+            return f"Language ---{language}--- provided is not supported as per settings"
+    except Exception as ex:
+        log_error(str(ex))
+        return "Error processing the request"
+
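A note on isTargetLanguageSupported (both here and in the notebook cell it mirrors): the function lowercases the incoming code before the membership check, while settings.json stores mixed-case codes such as "zh-CN", so those entries can never match; this is the mismatch behind the "Language ---zh-CN--- provided is not supported" output recorded in the notebook. A sketch of a case-insensitive check, offered as one possible adjustment rather than part of the commit (the helper name is hypothetical):

# Sketch: normalise both sides so codes like "zh-CN" match the configured list.
def is_code_supported(target_langcode, language_config):
    supported = {code.lower() for code in language_config}
    return target_langcode.lower() in supported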