sengzi committed on
Commit
4e383ee
1 Parent(s): 41c45dd

Update Sejarah.py

Browse files
Files changed (1) hide show
  1. Sejarah.py +14 -25
Sejarah.py CHANGED
@@ -4,7 +4,7 @@ from haystack import Pipeline
4
  from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader
5
  from haystack.document_stores import InMemoryDocumentStore
6
  from haystack.utils import print_answers
7
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
 
9
  class Sejarah:
10
  def __init__(self):
@@ -40,32 +40,26 @@ class Sejarah:
40
  self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
41
  self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])
42
 
43
- #Malay to English Model
44
- self.id_en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")
45
- self.id_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-id-en")
46
-
47
- #English to Malay Model
48
- self.en_id_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-id")
49
- self.en_id_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-id")
50
-
51
 
52
  def language_converter(self, content, lang, method):
53
 
54
- content = content.lower()
55
-
56
  if lang == "en":
57
  if method == "question":
58
- tokenized_text = self.en_id_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
59
- translation = self.en_id_model.generate(**tokenized_text)
60
- content = self.en_id_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
61
 
 
 
62
  else:
63
- tokenized_text = self.id_en_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
64
- translation = self.id_en_model.generate(**tokenized_text)
65
- content = self.id_en_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
66
 
67
- return content
68
-
 
 
 
 
 
 
69
  def interface(self, question):
70
  language = self.detect_language(question)
71
 
@@ -82,9 +76,4 @@ class Sejarah:
82
  answer = self.language_converter(result['answers'][0].answer, language, "answer")
83
  context = self.language_converter(result['answers'][0].context, language, "answer")
84
 
85
- return answer, context
86
-
87
-
88
- def detect_language(self, content):
89
- lang = langid.classify(content)
90
- return lang[0]
 
4
  from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader
5
  from haystack.document_stores import InMemoryDocumentStore
6
  from haystack.utils import print_answers
7
+ from deep_translator import GoogleTranslator
8
 
9
  class Sejarah:
10
  def __init__(self):
 
40
  self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
41
  self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])
42
 
 
 
 
 
 
 
 
 
43
 
44
  def language_converter(self, content, lang, method):
45
 
 
 
46
  if lang == "en":
47
  if method == "question":
48
+ new_content = GoogleTranslator(source='en', target='ms').translate(content)
 
 
49
 
50
+ if "when" in content:
51
+ new_content = new_content.replace("apabila","bila")
52
  else:
53
+ new_content = GoogleTranslator(source='ms', target='en').translate(content)
 
 
54
 
55
+ return new_content
56
+
57
+
58
def detect_language(self, content):
    """Return the first element of ``langid.classify(content)``.

    NOTE(review): presumably this element is the language code string
    (e.g. ``"en"``) — confirm against the langid documentation.
    """
    return langid.classify(content)[0]
61
+
62
+
63
  def interface(self, question):
64
  language = self.detect_language(question)
65
 
 
76
  answer = self.language_converter(result['answers'][0].answer, language, "answer")
77
  context = self.language_converter(result['answers'][0].context, language, "answer")
78
 
79
+ return answer, context