Spaces:

Rajendransp133
/

microservice-NMT

Sleeping

File size: 6,142 Bytes

ac901c7

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f73dec47",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['இது ஒரு சோதனை வாக்கியம். ', 'இது மற்றொரு நீண்ட வித்தியாசமான சோதனை வாக்கியமாகும். ', '9876543210 என்ற எண்ணுக்கு ஒரு எஸ்எம்எஸ் அனுப்பவும், 2023 அக்டோபர் 15 ஆம் தேதிக்குள் newemail123@xyz.com என்ற மின்னஞ்சல் முகவரிக்கு அனுப்பவும். ']\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "import torch\n",
    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
    "\n",
    "ip = IndicProcessor(inference=True)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n",
    "\n",
    "sentences = [\n",
    "    \"This is a test sentence.\",\n",
    "    \"This is another longer different test sentence.\",\n",
    "    \"Please send an SMS to 9876543210 and an email on newemail123@xyz.com by 15th October, 2023.\",\n",
    "]\n",
    "\n",
    "batch = ip.preprocess_batch(sentences, src_lang=\"eng_Latn\", tgt_lang=\"tam_Taml\", visualize=False) # set it to visualize=True to print a progress bar\n",
    "batch = tokenizer(batch, padding=\"longest\", truncation=True, max_length=256, return_tensors=\"pt\")\n",
    "\n",
    "with torch.inference_mode():\n",
    "    outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)\n",
    "\n",
    "with tokenizer.as_target_tokenizer():\n",
    "    # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.\n",
    "    # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.\n",
    "    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
    "\n",
    "outputs = ip.postprocess_batch(outputs, lang=\"tam_Taml\")\n",
    "print(outputs)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ec49007",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fa9fc68",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import torch\n",
    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
    "\n",
    "# Add local IndicTransToolkit path if needed\n",
    "sys.path.append(os.path.abspath(\"libs/IndicTransToolkit\"))\n",
    "from IndicTransToolkit.processor import IndicProcessor\n",
    "\n",
    "# Load processor, tokenizer, and model\n",
    "ip = IndicProcessor(inference=True)\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n",
    "\n",
    "def translate(text, target_lang):\n",
    "    if not text.strip():\n",
    "        return \"Please enter some text.\"\n",
    "\n",
    "    # Preprocess\n",
    "    batch = ip.preprocess_batch([text], src_lang=\"eng_Latn\", tgt_lang=target_lang)\n",
    "    batch = tokenizer(batch, padding=\"longest\", truncation=True, max_length=256, return_tensors=\"pt\")\n",
    "\n",
    "    # Inference\n",
    "    with torch.inference_mode():\n",
    "        outputs = model.generate(**batch, num_beams=5, max_length=256)\n",
    "\n",
    "    # Postprocess\n",
    "    with tokenizer.as_target_tokenizer():\n",
    "        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
    "\n",
    "    return ip.postprocess_batch(decoded, lang=target_lang)[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "c4ae654a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'வணக்கம். '"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "translate_text(\"hello\",\"tam_Taml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "530f0925",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translation: टाम् @टाम्ल नमस्कार, आप कैसे हैं? \n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "\n",
    "url = \"http://localhost:7860/translate\"\n",
    "\n",
    "payload = {\n",
    "    \"text\": \"Hello, how are you?\",\n",
    "    \"target_lang\": \"tam_Taml\"\n",
    "}\n",
    "\n",
    "headers = {\n",
    "    \"Content-Type\": \"application/json\"\n",
    "}\n",
    "\n",
    "response = requests.post(url, json=payload, headers=headers)\n",
    "\n",
    "if response.status_code == 200:\n",
    "    print(\"Translation:\", response.json()[\"translation\"])\n",
    "else:\n",
    "    print(\"Error:\", response.status_code, response.text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73eb9c61",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "indietrans2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}