yewsam1277
committed on
Commit
•
adf0286
1
Parent(s):
fe3d83d
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -*- coding: utf-8 -*-
"""Untitled1 (2).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1W44vqcumLa_CtuLGpbS8dEk4WtCFUr-z
"""

# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install --upgrade pip
# pip install farm-haystack[colab]

# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install malaya
"""Step 2"""

from haystack.telemetry import tutorial_running

tutorial_running(1)
import logging

# File-wide logging stays at WARNING; haystack's own loggers are raised to INFO.
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

# Load every file from the Drive data directory, then split each document
# into ~100-word passages that do not break sentences mid-way.
all_docs = convert_files_to_docs(dir_path='/content/drive/MyDrive/data/malaysia/')
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
docs = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")
from haystack.document_stores import InMemoryDocumentStore

# BM25 keyword retrieval over an in-memory store — no external service needed.
document_store = InMemoryDocumentStore(use_bm25=True)

import os
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

# Index every file in the data directory. os.path.join replaces the
# original's fragile `dir + "/" + name` concatenation, and the path is
# named once instead of being repeated inline.
DATA_DIR = '/content/drive/MyDrive/data/malaysia'
files_to_index = [os.path.join(DATA_DIR, f) for f in os.listdir(DATA_DIR)]
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)
from haystack.nodes import FARMReader
from haystack.utils import fetch_archive_from_http

# Fine-tune a distilled SQuAD reader on the Malay SQuAD-format training set.
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)
# NOTE: the original also assigned data_dir = "data/squad20" but never used
# it — train() below points directly at the Drive data directory, so the
# dead assignment has been removed.
reader.train(
    data_dir='/content/drive/MyDrive/data/malaysia',
    train_filename='ms-train-2.0.json',
    use_gpu=True,
    n_epochs=1,
    save_dir="MyCustomReader",
)

# Persist the fine-tuned reader so later cells can reload it without retraining.
reader.save(directory="/content/drive/MyDrive/data/malaysia/MyCustomReader")
from haystack.nodes import BM25Retriever

# Sparse keyword retriever backed by the BM25-enabled in-memory store above.
retriever = BM25Retriever(document_store=document_store)

from haystack.nodes import TransformersReader

# Reload the reader fine-tuned earlier from its saved checkpoint on Drive.
new_reader = FARMReader(model_name_or_path="/content/drive/MyDrive/data/malaysia/MyCustomReader", use_gpu=True)
#reader = TransformersReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
from haystack.pipelines import ExtractiveQAPipeline

# Retriever narrows the corpus to candidate passages; the reader then
# extracts exact answer spans from them.
pipe = ExtractiveQAPipeline(new_reader, retriever)

prediction = pipe.run(
    query="siapakah najib razak",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)

# The notebook's bare `prediction['answers']` expression is a no-op when
# run as a script, so the answers are printed explicitly instead.
print(prediction['answers'])
from getpass import getpass

model_api_key = getpass("Enter model provider API key:")

import os
import requests

API_URL = "https://api-inference.huggingface.co/models/yewsam1277/question-answering-bahasa-malaysia"
# SECURITY: the original hard-coded a real Hugging Face bearer token here
# ("hf_..."). That token is leaked and must be revoked. The key is now read
# from the HF_API_TOKEN environment variable, falling back to the key
# entered interactively above.
headers = {"Authorization": f"Bearer {os.environ.get('HF_API_TOKEN', model_api_key)}"}


def query(payload):
    """POST *payload* to the hosted inference API and return the parsed JSON."""
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


output = query({
    "inputs": {
        "question": "What's my name?",
        "context": "My name is Clara and I live in Berkeley."
    },
})

print(output)
from haystack.nodes import PromptNode

# Hosted model used both for prompting and for summarising the conversation.
model_name = "yewsam1277/question-answering-bahasa-malaysia"
prompt_node = PromptNode(model_name, api_key=model_api_key, max_length=256)

from haystack.agents.memory import ConversationSummaryMemory

# Keep a running summary of the dialogue rather than the full transcript.
summary_memory = ConversationSummaryMemory(prompt_node)

from haystack.agents.conversational import ConversationalAgent

conversational_agent = ConversationalAgent(prompt_node=prompt_node, memory=summary_memory)
# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install wikipedia-api

"""Step 1"""
import wikipediaapi

# Crawl the Malay-language Wikipedia: start at the "Malaysia" article and
# collect it plus every article it links to (one level deep).
wiki = wikipediaapi.Wikipedia('ms')
page = wiki.page('Malaysia')

pages = {'Malaysia': page}
pages.update(page.links)

len(pages)

from google.colab import drive
drive.mount('/content/drive')

done = 0
for key in pages:
    done += 1
    try:
        # Some titles are not valid filenames and some fetches fail; those
        # pages are skipped (and reported) instead of aborting the crawl.
        # The original printed "Written: ..." even when the write failed
        # and silently swallowed every exception.
        with open(f'/content/drive/MyDrive/data/malaysia/{pages[key].title}.txt', 'w', encoding='utf-8') as f:
            get_text = pages[key].text
            get_text = get_text.lower().replace('\n', ' ')
            f.write(get_text)
    except Exception as exc:
        print(f"Skipped: {pages[key].title} ({exc})")
    else:
        print(f"Written: {pages[key].title}\t(done {done})", end='\r')

len(pages)

"""Training"""