Delete main2.py
main2.py
DELETED
@@ -1,102 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Jan  4 05:56:28 2023
-
-@author: dreji18
-"""
-
-# loading the packages
-from rake_nltk import Rake
-import wikipedia
-from rank_bm25 import BM25Okapi
-import torch
-from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
-from fastapi import FastAPI
-
-app = FastAPI()
-
-@app.get("/")
-def read_root():
-    return {"Hello": "World"}
-
-# keyword extraction function
-def keyword_extractor(query):
-    """
-    Rake has some useful features:
-    1. it automatically converts the text to lower case
-    2. it extracts the important key phrases
-    3. it also extracts multi-word phrases (e.g. Deep Learning, Capital City)
-    """
-    r = Rake()  # uses the English stopwords from NLTK and all punctuation characters
-    r.extract_keywords_from_text(query)
-    keywords = r.get_ranked_phrases()  # keyword phrases ranked from highest to lowest
-    return keywords
-
-# data collection using wikipedia
-def data_collection(search_words):
-    """Collect candidate summaries from Wikipedia."""
-    search_query = ' '.join(search_words)
-    wiki_pages = wikipedia.search(search_query, results=5)
-
-    information_list = []
-    pages_list = []
-    for page in wiki_pages:
-        try:
-            info = wikipedia.summary(page)
-            if any(word in info.lower() for word in search_words):
-                information_list.append(info)
-                pages_list.append(page)
-        except Exception:
-            # skip pages that fail to load (e.g. disambiguation errors)
-            pass
-
-    original_info = information_list
-    # truncate each summary to 1000 characters so the QA input stays roughly within DistilBERT's 512-token limit
-    information_list = [item[:1000] for item in information_list]
-
-    return information_list, pages_list, original_info
-
-# document ranking function
-def document_ranking(documents, query, n):
-    """Rank the collected documents against the query with BM25."""
-    datastore = []  # fall back to an empty result if ranking fails
-    try:
-        tokenized_corpus = [doc.split(" ") for doc in documents]
-        bm25 = BM25Okapi(tokenized_corpus)
-        tokenized_query = query.split(" ")
-        doc_scores = bm25.get_scores(tokenized_query)  # per-document BM25 scores (not used further)
-        datastore = bm25.get_top_n(tokenized_query, documents, n)
-    except Exception:
-        pass
-    return datastore
-
-def qna(context, question):
-    """Extract an answer span from the context with DistilBERT."""
-    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_token_type_ids=True)
-    model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad', return_dict=False)
-    encoding = tokenizer.encode_plus(question, context)
-    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
-    start_scores, end_scores = model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
-    ans_tokens = input_ids[torch.argmax(start_scores): torch.argmax(end_scores) + 1]
-    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
-    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
-
-    return answer_tokens_to_string
-
-@app.get("/predict")
-def answergen(search_string: str):
-    try:
-        keyword_list = keyword_extractor(search_string)
-        information, pages, original_data = data_collection(keyword_list)
-        datastore = document_ranking(information, search_string, 3)
-
-        answers_list = []
-        for i in range(len(datastore)):
-            result = qna(datastore[i], search_string)
-            answers_list.append(result)
-
-        return {"answer 1": answers_list[0],
-                "answer 2": answers_list[1],
-                "answer 3": answers_list[2]}
-    except Exception:
-        return {"error": "sorry, couldn't process the request"}
-
-# uvicorn main2:app --port 8000 --reload
-
-#%%
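
With the file above served locally (for example via the uvicorn command in its closing comment), the /predict endpoint can be exercised with a plain GET request. The snippet below is a minimal client sketch and is not part of the original Space: it assumes the API is reachable at 127.0.0.1:8000, and the query string is purely illustrative.

import requests

# call the /predict route of the locally running FastAPI app
response = requests.get(
    "http://127.0.0.1:8000/predict",
    params={"search_string": "who invented the telephone"},
    timeout=120,  # the first request is slow because the DistilBERT weights are downloaded and loaded
)
print(response.json())  # e.g. {"answer 1": "...", "answer 2": "...", "answer 3": "..."}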