Akbartus committed on
Commit 801b974 · verified · 1 Parent(s): a23ab3b

Delete main2.py

Files changed (1)
  1. main2.py +0 -102
main2.py DELETED
@@ -1,102 +0,0 @@
- # -*- coding: utf-8 -*-
- """
- Created on Wed Jan 4 05:56:28 2023
-
- @author: dreji18
- """
-
- # loading the packages
- from rake_nltk import Rake
- import wikipedia
- from rank_bm25 import BM25Okapi
- import torch
- from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
- from fastapi import FastAPI
- app = FastAPI()
-
- @app.get("/")
- def read_root():
-     return {"Hello": "World"}
-
- # keyword extraction function
- def keyword_extractor(query):
-     """
-     Rake has some useful features:
-     1. it automatically converts text to lower case
-     2. it extracts important key phrases
-     3. it also extracts combined phrases (e.g. Deep Learning, Capital City)
-     """
-     r = Rake()  # uses English stopwords from NLTK and all punctuation characters
-     r.extract_keywords_from_text(query)
-     keywords = r.get_ranked_phrases()  # key phrases ranked highest to lowest
-     return keywords
-
- # data collection using wikipedia
- def data_collection(search_words):
-     """wikipedia"""
-     search_query = ' '.join(search_words)
-     wiki_pages = wikipedia.search(search_query, results=5)
-
-     information_list = []
-     pages_list = []
-     for i in wiki_pages:
-         try:
-             info = wikipedia.summary(i)
-             if any(word in info.lower() for word in search_words):
-                 information_list.append(info)
-                 pages_list.append(i)
-         except Exception:
-             pass
-
-     original_info = information_list
-     information_list = [item[:1000] for item in information_list]  # truncate each summary to 1000 characters
-
-     return information_list, pages_list, original_info
-
- # document ranking function
- def document_ranking(documents, query, n):
-     """BM25"""
-     try:
-         tokenized_corpus = [doc.split(" ") for doc in documents]
-         bm25 = BM25Okapi(tokenized_corpus)
-         tokenized_query = query.split(" ")
-         doc_scores = bm25.get_scores(tokenized_query)
-         datastore = bm25.get_top_n(tokenized_query, documents, n)
-     except Exception:
-         datastore = []
-     return datastore
-
- def qna(context, question):
-     """DistilBert"""
-     tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_token_type_ids=True)
-     model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad', return_dict=False)
-     encoding = tokenizer.encode_plus(question, context)
-     input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
-     start_scores, end_scores = model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
-     ans_tokens = input_ids[torch.argmax(start_scores) : torch.argmax(end_scores) + 1]
-     answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
-     answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
-
-     return answer_tokens_to_string
-
- @app.get("/predict")
- def answergen(search_string: str):
-     try:
-         keyword_list = keyword_extractor(search_string)
-         information, pages, original_data = data_collection(keyword_list)
-         datastore = document_ranking(information, search_string, 3)
-
-         answers_list = []
-         for i in range(len(datastore)):
-             result = qna(datastore[i], search_string)
-             answers_list.append(result)
-
-         return {"answer 1": answers_list[0],
-                 "answer 2": answers_list[1],
-                 "answer 3": answers_list[2]}
-     except Exception:
-         return {"error": "sorry, couldn't process the request"}
-
- # uvicorn main2:app --port 8000 --reload
-
- #%%
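
For reference, the deleted module served its question-answering pipeline through the /predict route shown above. Below is a minimal client-side sketch of how that endpoint could be queried once the app is running; the host, port, example question, and the use of the requests library are illustrative assumptions, not part of the original file.

# Hypothetical client for the deleted service, assuming it were restored and
# started with:  uvicorn main2:app --port 8000 --reload
import requests  # assumed HTTP client for this example

# /predict takes a single query parameter, search_string, as defined by answergen()
resp = requests.get(
    "http://127.0.0.1:8000/predict",
    params={"search_string": "What is the capital city of France?"},
)
print(resp.json())  # e.g. {"answer 1": "...", "answer 2": "...", "answer 3": "..."}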