hhalim committed on
Commit ecadcdb • 1 Parent(s): 249b095

Upload 3 files

Files changed (3)
  1. README.md +4 -4
  2. app.py +200 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: WikipediaAIDataScience
-emoji: 🐒
-colorFrom: blue
-colorTo: red
+title: WikipediaAIWithDataframeMemory
+emoji: 🏒
+colorFrom: gray
+colorTo: pink
 sdk: gradio
 sdk_version: 3.16.2
 app_file: app.py
app.py ADDED
@@ -0,0 +1,200 @@
+import spacy
+import wikipediaapi
+import wikipedia
+from wikipedia.exceptions import DisambiguationError
+from transformers import TFAutoModel, AutoTokenizer
+import numpy as np
+import pandas as pd
+import faiss
+import gradio as gr
+
+try:
+    nlp = spacy.load("en_core_web_sm")
+except OSError:
+    spacy.cli.download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
+
+wh_words = ['what', 'who', 'how', 'when', 'which']
+def get_concepts(text):
+    text = text.lower()
+    doc = nlp(text)
+    concepts = []
+    for chunk in doc.noun_chunks:
+        if chunk.text not in wh_words:
+            concepts.append(chunk.text)
+    return concepts
+
+def get_passages(text, k=100):
+    doc = nlp(text)
+    passages = []
+    passage_len = 0
+    passage = ""
+    sents = list(doc.sents)
+    for i in range(len(sents)):
+        sen = sents[i]
+        passage_len += len(sen)
+        if passage_len >= k:
+            passages.append(passage)
+            passage = sen.text
+            passage_len = len(sen)
+            continue
+
+        elif i == (len(sents) - 1):
+            passage += " " + sen.text
+            passages.append(passage)
+            passage = ""
+            passage_len = 0
+            continue
+
+        passage += " " + sen.text
+    return passages
+
+def get_dicts_for_dpr(concepts, n_results=20, k=100):
+    dicts = []
+    for concept in concepts:
+        wikis = wikipedia.search(concept, results=n_results)
+        print(concept, "No of Wikis: ", len(wikis))
+        for wiki in wikis:
+            try:
+                html_page = wikipedia.page(title=wiki, auto_suggest=False)
+            except DisambiguationError:
+                continue
+
+            htmlResults = html_page.content
+
+            passages = get_passages(htmlResults, k=k)
+            for passage in passages:
+                i_dicts = {}
+                i_dicts['text'] = passage
+                i_dicts['title'] = wiki
+                dicts.append(i_dicts)
+    return dicts
+
+passage_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
+query_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")
+p_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
+q_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")
+
+def get_title_text_combined(passage_dicts):
+    res = []
+    for p in passage_dicts:
+        res.append(tuple((p['title'], p['text'])))
+    return res
+
+def extracted_passage_embeddings(processed_passages, max_length=156):
+    passage_inputs = p_tokenizer.batch_encode_plus(
+        processed_passages,
+        add_special_tokens=True,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_token_type_ids=True
+    )
+    passage_embeddings = passage_encoder.predict([np.array(passage_inputs['input_ids']),
+                                                  np.array(passage_inputs['attention_mask']),
+                                                  np.array(passage_inputs['token_type_ids'])],
+                                                 batch_size=64,
+                                                 verbose=1)
+    return passage_embeddings
+
+def extracted_query_embeddings(queries, max_length=64):
+    query_inputs = q_tokenizer.batch_encode_plus(
+        queries,
+        add_special_tokens=True,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_token_type_ids=True
+    )
+    query_embeddings = query_encoder.predict([np.array(query_inputs['input_ids']),
+                                              np.array(query_inputs['attention_mask']),
+                                              np.array(query_inputs['token_type_ids'])],
+                                             batch_size=1,
+                                             verbose=1)
+    return query_embeddings
+
+# Wikipedia API:
+
+def get_pagetext(page):
+    s = str(page).replace("/t", "")
+
+    return s
+
+def get_wiki_summary(search):
+    wiki_wiki = wikipediaapi.Wikipedia('en')
+    page = wiki_wiki.page(search)
+
+    isExist = page.exists()
+    if not isExist:
+        return isExist, "Not found", "Not found", "Not found", "Not found"
+
+    pageurl = page.fullurl
+    pagetitle = page.title
+    pagesummary = page.summary[0:60]
+    pagetext = get_pagetext(page.text)
+
+    backlinks = page.backlinks
+    linklist = ""
+    for link in backlinks.items():
+        pui = link[0]
+        linklist += pui + " , "
+        a = 1
+
+    categories = page.categories
+    categorylist = ""
+    for category in categories.items():
+        pui = category[0]
+        categorylist += pui + " , "
+        a = 1
+
+    links = page.links
+    linklist2 = ""
+    for link in links.items():
+        pui = link[0]
+        linklist2 += pui + " , "
+        a = 1
+
+    sections = page.sections
+
+    ex_dic = {
+        'Entity': ["URL", "Title", "Summary", "Text", "Backlinks", "Links", "Categories"],
+        'Value': [pageurl, pagetitle, pagesummary, pagetext, linklist, linklist2, categorylist]
+    }
+
+    df = pd.DataFrame(ex_dic)
+
+    return df
+
+def search(question):
+    concepts = get_concepts(question)
+    print("concepts: ", concepts)
+    dicts = get_dicts_for_dpr(concepts, n_results=1)
+    lendicts = len(dicts)
+    print("dicts len: ", lendicts)
+    if lendicts == 0:
+        return pd.DataFrame()
+    processed_passages = get_title_text_combined(dicts)
+    passage_embeddings = extracted_passage_embeddings(processed_passages)
+    query_embeddings = extracted_query_embeddings([question])
+    faiss_index = faiss.IndexFlatL2(128)
+    faiss_index.add(passage_embeddings.pooler_output)
+    prob, index = faiss_index.search(query_embeddings.pooler_output, k=lendicts)
+    return pd.DataFrame([dicts[i] for i in index[0]])
+
+# AI UI SOTA - Gradio Blocks with UI formatting and event-driven UI
+with gr.Blocks() as demo:  # Blocks documentation on event listeners, start here: https://gradio.app/blocks_and_event_listeners/
+    gr.Markdown("<h1><center>🍰 Ultimate Wikipedia AI 🎨</center></h1>")
+    gr.Markdown("""<div align="center">Search and Find Anything Then Use in AI! <a href="https://www.mediawiki.org/wiki/API:Main_page">MediaWiki - API for Wikipedia</a>. <a href="https://paperswithcode.com/datasets?q=wikipedia&v=lst&o=newest">Papers, Code, Datasets for SOTA w/ Wikipedia</a></div>""")
+    with gr.Row():  # inputs and buttons
+        inp = gr.Textbox(lines=1, value="Syd Mead", label="Question")
+    with gr.Row():  # inputs and buttons
+        b3 = gr.Button("Search AI Summaries")
+        b4 = gr.Button("Search Web Live")
+    with gr.Row():  # outputs DF1
+        out = gr.Dataframe(label="Answers", type="pandas")
+    with gr.Row():  # output DF2
+        out_DF = gr.Dataframe(wrap=True, max_rows=1000, overflow_row_behaviour="paginate", datatype=["markdown", "markdown"], headers=['Entity', 'Value'])
+    inp.submit(fn=get_wiki_summary, inputs=inp, outputs=out_DF)
+    b3.click(fn=search, inputs=inp, outputs=out)
+    b4.click(fn=get_wiki_summary, inputs=inp, outputs=out_DF)
+demo.launch(debug=True, show_error=True)
requirements.txt ADDED
@@ -0,0 +1,10 @@
+wikipedia
+spacy
+faiss-cpu
+pandas
+transformers
+tensorflow
+wikipedia-api
+beautifulsoup4
+gradio
+requests