# yewsam1277's picture
# Upload app.py
# adf0286
# -*- coding: utf-8 -*-
"""Untitled1 (2).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1W44vqcumLa_CtuLGpbS8dEk4WtCFUr-z
"""
# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install --upgrade pip
# pip install farm-haystack[colab]
# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install malaya
"""Step 2"""
import logging

from haystack.telemetry import tutorial_running

# Tag this session as tutorial 1 for Haystack telemetry
# (meaning inferred from the function name -- confirm against the Haystack docs).
tutorial_running(1)

# Everything logs at WARNING by default; only the "haystack" logger is
# raised to INFO so pipeline progress is visible without third-party noise.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s - %(message)s",
)
logging.getLogger("haystack").setLevel(logging.INFO)
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

# Load the files found under the Drive data folder as Haystack documents.
all_docs = convert_files_to_docs(dir_path='/content/drive/MyDrive/data/malaysia/')

# Split into chunks of up to 100 words without cutting sentences in half,
# stripping empty lines and stray whitespace but leaving headers/footers alone.
preprocessor = PreProcessor(
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
)
docs = preprocessor.process(all_docs)

# NOTE(review): `docs` is only used for this summary in the visible part of
# the file -- it is not what gets written to the document store.
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")
import os

from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

# In-memory store with BM25 enabled so a BM25 retriever can query it.
document_store = InMemoryDocumentStore(use_bm25=True)

# The data folder is shared with non-corpus entries: the reader's training
# JSON and the saved `MyCustomReader` model directory both live there. Index
# only regular .txt files so a directory is never handed to the text
# converter and training data is not mixed into the corpus. Sorting gives a
# stable indexing order, since os.listdir() returns entries in arbitrary order.
_corpus_dir = '/content/drive/MyDrive/data/malaysia'
files_to_index = [
    os.path.join(_corpus_dir, f)
    for f in sorted(os.listdir(_corpus_dir))
    if f.endswith('.txt') and os.path.isfile(os.path.join(_corpus_dir, f))
]

indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)
# Fine-tune an extractive QA reader on the project's own SQuAD-style file.
from haystack.nodes import FARMReader
# NOTE(review): `fetch_archive_from_http` is imported but not called anywhere
# in the visible code.
from haystack.utils import fetch_archive_from_http
# Start from the `distilbert-base-uncased-distilled-squad` checkpoint, on GPU.
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)
# NOTE(review): `data_dir` is assigned here but is not passed to any call in
# the visible code -- train() below receives its own literal path instead.
data_dir = "data/squad20"
# data_dir = "PATH/TO_YOUR/TRAIN_DATA"
# Train for a single epoch on `ms-train-2.0.json` from the Drive data folder.
# train() is given the relative save_dir "MyCustomReader"; the explicit save()
# below then writes the reader under the Drive folder, which is the path the
# later `FARMReader(...)` load reads from.
reader.train(data_dir='/content/drive/MyDrive/data/malaysia', train_filename='ms-train-2.0.json', use_gpu=True, n_epochs=1, save_dir="MyCustomReader")
reader.save(directory="/content/drive/MyDrive/data/malaysia/MyCustomReader")
from haystack.nodes import BM25Retriever
from haystack.nodes import TransformersReader
from haystack.pipelines import ExtractiveQAPipeline

# Keyword (BM25) retrieval over the in-memory document store.
retriever = BM25Retriever(document_store=document_store)

# Load the fine-tuned reader back from the Drive folder it was saved to.
new_reader = FARMReader(
    model_name_or_path="/content/drive/MyDrive/data/malaysia/MyCustomReader",
    use_gpu=True,
)
# Alternative reader kept for reference:
# reader = TransformersReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Retriever narrows the corpus; the reader extracts answer spans from the hits.
pipe = ExtractiveQAPipeline(reader=new_reader, retriever=retriever)

# Ask a sample question: 10 candidate documents in, 5 answers out.
prediction = pipe.run(
    query="siapakah najib razak",
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
)
# Notebook leftover: displays the answers in a cell, has no visible effect
# when run as a script.
prediction['answers']
from getpass import getpass

import requests

# Ask for the key at run time so no credential is stored in the file.
model_api_key = getpass("Enter model provider API key:")

API_URL = "https://api-inference.huggingface.co/models/yewsam1277/question-answering-bahasa-malaysia"
# Build the auth header from the prompted key rather than a literal token.
# NOTE(review): a real token used to be hard-coded on this line and was
# published with the file -- it should be revoked.
headers = {"Authorization": f"Bearer {model_api_key}"}


def query(payload):
    """Send ``payload`` to the hosted inference endpoint and return its reply.

    Args:
        payload: JSON-serialisable request body. The call below sends
            ``{"inputs": {"question": ..., "context": ...}}``.

    Returns:
        The response body decoded from JSON. The HTTP status is not checked,
        so an error reply from the service is returned as-is.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the request times out.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    return response.json()


# Smoke test of the endpoint with a tiny English example.
output = query({
    "inputs": {
        "question": "What's my name?",
        "context": "My name is Clara and I live in Berkeley."
    },
})
print(output)
from haystack.agents.conversational import ConversationalAgent
from haystack.agents.memory import ConversationSummaryMemory
from haystack.nodes import PromptNode

# Hosted model used for both generating replies and summarising history.
model_name = "yewsam1277/question-answering-bahasa-malaysia"
prompt_node = PromptNode(
    model_name_or_path=model_name,
    max_length=256,
    api_key=model_api_key,
)

# Memory that uses the same prompt node to summarise the conversation.
summary_memory = ConversationSummaryMemory(prompt_node=prompt_node)

# Chat agent wired to the prompt node and the summary memory.
conversational_agent = ConversationalAgent(
    memory=summary_memory,
    prompt_node=prompt_node,
)
# Commented out IPython magic to ensure Python compatibility.
# %%bash
#
# pip install wikipedia-api
"""Step 1"""
import wikipediaapi

# NOTE(review): 'ms' is passed positionally; some wikipedia-api releases take
# a user agent as the first argument -- confirm against the installed version.
wiki = wikipediaapi.Wikipedia('ms')
page = wiki.page('Malaysia')

# The seed article plus every entry of its `links` mapping, in one dict.
pages = {'Malaysia': page, **page.links}

# Notebook leftover: shows the page count in a cell, no effect in a script.
len(pages)
import logging
import os

from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Create the output folder up front; without it every write below would fail.
os.makedirs('/content/drive/MyDrive/data/malaysia', exist_ok=True)

# Dump each collected page as one lower-cased, single-line text file.
done = 0
for key in pages:
    title = pages[key].title
    # A path separator inside a title would point into a non-existent
    # sub-directory, so replace it before building the file name.
    safe_title = title.replace(os.sep, '_')
    try:
        # Fetch and normalise the text BEFORE opening the file, so a failed
        # download does not leave an empty file behind.
        get_text = pages[key].text
        get_text = get_text.lower().replace('\n', ' ')
        with open(
            f'/content/drive/MyDrive/data/malaysia/{safe_title}.txt',
            'w',
            encoding='utf-8',
        ) as f:
            f.write(get_text)
    except Exception as e:
        # Still best-effort (one bad page must not stop the crawl), but the
        # failure is reported instead of being silently swallowed.
        logging.getLogger(__name__).warning("Could not save page %r: %s", title, e)
        status = "Skipped"
    else:
        status = "Written"
    # `done` counts pages processed, whether written or skipped.
    done += 1
    print(f"{status}: {title}\t(done {done})", end='\r')

# Notebook leftover: shows the page count in a cell, no effect in a script.
len(pages)
"""Training"""