yewsam1277 committed on
Commit
adf0286
1 Parent(s): fe3d83d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -0
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""Untitled1 (2).ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1W44vqcumLa_CtuLGpbS8dEk4WtCFUr-z
"""

# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install --upgrade pip
# pip install farm-haystack[colab]

# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install malaya

"""Step 2"""

# Report (anonymized) tutorial usage to Haystack telemetry.
from haystack.telemetry import tutorial_running

tutorial_running(1)
import logging

# Quiet everything to WARNING, but surface Haystack's own INFO messages.
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
31
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

# Load every file from the Drive corpus directory into Haystack Documents.
all_docs = convert_files_to_docs(dir_path='/content/drive/MyDrive/data/malaysia/')
# Clean whitespace/empty lines and split each document into ~100-word
# passages, keeping sentence boundaries intact.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
docs = preprocessor.process(all_docs)

# NOTE(review): `docs` is never written into the document store below —
# the TextIndexingPipeline re-reads the raw files instead, so this
# preprocessing result is only used for the counts printed here.
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")
47
from haystack.document_stores import InMemoryDocumentStore

# In-memory document store with BM25 (sparse keyword) retrieval enabled.
document_store = InMemoryDocumentStore(use_bm25=True)

import os
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

# Build the list of corpus files. Hoist the duplicated directory literal
# into one variable and use os.path.join instead of manual '+ "/" +'
# concatenation (portable, no accidental double slashes).
corpus_dir = '/content/drive/MyDrive/data/malaysia'
files_to_index = [os.path.join(corpus_dir, f) for f in os.listdir(corpus_dir)]

# Convert, preprocess and write the raw files into the store.
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)
58
from haystack.nodes import FARMReader
from haystack.utils import fetch_archive_from_http

# Start from a distilled SQuAD reader and fine-tune it on the
# Malay SQuAD-format training file from Drive.
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)
data_dir = "data/squad20"
# data_dir = "PATH/TO_YOUR/TRAIN_DATA"
# NOTE(review): `data_dir` above is unused — training reads directly from
# the Drive path passed below.
reader.train(data_dir='/content/drive/MyDrive/data/malaysia', train_filename='ms-train-2.0.json', use_gpu=True, n_epochs=1, save_dir="MyCustomReader")

# Persist the fine-tuned reader to Drive so it can be reloaded later.
reader.save(directory="/content/drive/MyDrive/data/malaysia/MyCustomReader")
68
from haystack.nodes import BM25Retriever

# Sparse retriever over the BM25-enabled in-memory store.
retriever = BM25Retriever(document_store=document_store)

from haystack.nodes import TransformersReader

# Reload the fine-tuned reader saved above.
new_reader = FARMReader(model_name_or_path="/content/drive/MyDrive/data/malaysia/MyCustomReader", use_gpu=True)
#reader = TransformersReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

from haystack.pipelines import ExtractiveQAPipeline

# Retriever narrows the corpus; reader extracts exact answer spans.
pipe = ExtractiveQAPipeline(new_reader, retriever)

# Sample question in Malay: "who is Najib Razak".
prediction = pipe.run(
    query="siapakah najib razak",
    params={
        "Retriever": {"top_k": 10},  # candidate passages to pass to the reader
        "Reader": {"top_k": 5}       # answer spans to keep
    }
)

prediction['answers']
91
from getpass import getpass

# Prompt for the model-provider (Hugging Face) API key without echoing it.
model_api_key = getpass("Enter model provider API key:")
95
import requests

# Hosted Inference API endpoint for the fine-tuned Malay QA model.
API_URL = "https://api-inference.huggingface.co/models/yewsam1277/question-answering-bahasa-malaysia"
# SECURITY: the original committed a live `hf_...` bearer token in source.
# Never hard-code credentials — use the key collected via getpass above
# (and revoke the leaked token on huggingface.co).
headers = {"Authorization": f"Bearer {model_api_key}"}
100
def query(payload):
    """POST `payload` to the hosted Inference API and return the parsed JSON.

    Args:
        payload: JSON-serializable dict, e.g.
            {"inputs": {"question": ..., "context": ...}}.

    Returns:
        The decoded JSON response (answer dict, or an error payload).
    """
    # A timeout prevents the notebook from hanging forever if the API stalls
    # (requests waits indefinitely by default).
    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
    return response.json()
103
# Smoke-test the endpoint with the standard English QA example.
output = query({
    "inputs": {
        "question": "What's my name?",
        "context": "My name is Clara and I live in Berkeley."
    },
})

print(output)
113
from haystack.nodes import PromptNode

# Drive the fine-tuned model remotely through a PromptNode.
model_name = "yewsam1277/question-answering-bahasa-malaysia"
prompt_node = PromptNode(model_name, api_key=model_api_key, max_length=256)

from haystack.agents.memory import ConversationSummaryMemory

# Summarizing memory keeps the running conversation history compact.
summary_memory = ConversationSummaryMemory(prompt_node)

from haystack.agents.conversational import ConversationalAgent

conversational_agent = ConversationalAgent(prompt_node=prompt_node, memory=summary_memory)
128
# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install wikipedia-api

"""Step 1"""
135
import wikipediaapi

# Malay-language ('ms') Wikipedia client.
# NOTE(review): newer wikipedia-api releases require a user_agent
# argument — confirm the installed version before rerunning.
wiki = wikipediaapi.Wikipedia('ms')
page = wiki.page('Malaysia')

# Seed the corpus with the Malaysia article plus every page it links to.
pages = {'Malaysia': page}

pages.update(page.links)

len(pages)

# Mount Google Drive so the corpus can be written under MyDrive.
from google.colab import drive
drive.mount('/content/drive')
149
# Dump each fetched Wikipedia page to Drive as lowercase single-line text.
done = 0
for wiki_page in pages.values():  # iterate values directly, no pages[key] lookup
    done += 1
    try:
        # Explicit UTF-8: the platform default encoding may not handle Malay text.
        with open(f'/content/drive/MyDrive/data/malaysia/{wiki_page.title}.txt', 'w', encoding='utf-8') as f:
            get_text = wiki_page.text
            get_text = get_text.lower().replace('\n', ' ')
            f.write(get_text)
    except Exception as exc:
        # Best effort: skip pages that fail (e.g. titles containing '/'),
        # but report the failure instead of silently printing "Written".
        print(f"Failed: {wiki_page.title} ({exc})")
    else:
        print(f"Written: {wiki_page.title}\t(done {done})", end='\r')

len(pages)

"""Training"""