cpnepo committed on
Commit
481fca7
1 Parent(s): 4d91906

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from sentence_transformers import SentenceTransformer, util
3
+ from transformers import (AutoModelForQuestionAnswering,
4
+ AutoTokenizer, pipeline)
5
+
6
+ import pandas as pd
7
+ import regex as re
8
+ from urllib import request
9
+
# Streamlit app: answer a Harry Potter question by retrieving the three
# passages of book 1 most similar to the query and running an extractive
# question-answering model over them.

# Select model for question answering
model_name = "deepset/roberta-base-squad2"

# Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create pipeline from the objects loaded above, so they are actually used
# instead of being resolved a second time from the model name.
pipe = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Load Harry Potter book corpus from link
url = ("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt")
# The context manager closes the HTTP response once the body has been read.
with request.urlopen(url) as response:
    book1_raw_0 = response.read().decode('utf8')

# Text pre-processing
# Remove page statements
book1_raw_1 = re.sub(r'Page \| [0-9]+ Harry Potter [a-zA-Z \-]+J.K. Rowling',
                     '', book1_raw_0)

# Collapse newlines (and any other whitespace run) into a single space.
# Deleting them outright would fuse the last word of one line with the
# first word of the next whenever a line has no trailing space.
book1_raw_1 = re.sub(r'\s+', ' ', book1_raw_1)

# Remove the period after honorifics so it is not mistaken for a sentence
# end in the regrouping below. The period is escaped: an unescaped "Mr. "
# also matches "Mrs " and would rewrite it to "Mr ".
book1_raw_1 = re.sub(r'\b(Mr|Mrs|Ms)\. ', r'\1 ', book1_raw_1)

# Group into 3 sentences-long parts
paragraphs = re.findall(r"[^.?!]+[.?!][^.?!]+[.?!][^.?!]+[.?!]", book1_raw_1)

# Type in HP-related query here
query = st.text_area("Hello muggle! What is your question?")

# Model used to embed the query and the sentence groups
model_embed_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'

# The text area yields an empty string until the user types something; an
# empty question is not a valid input for the QA pipeline, so retrieval and
# answering only run once there is a real query.
if query.strip():
    # Perform sentence embedding on query and sentence groups
    model_embed = SentenceTransformer(model_embed_name)
    doc_emb = model_embed.encode(paragraphs)
    query_emb = model_embed.encode(query)

    # Compute cosine similarity between query and all document embeddings
    scores = util.cos_sim(query_emb, doc_emb)[0].cpu().tolist()

    # Pair docs with scores, sort by decreasing score and keep only the
    # 3 most similar groups
    doc_score_pairs = sorted(zip(paragraphs, scores),
                             key=lambda pair: pair[1],
                             reverse=True)[:3]

    # Join these similar groups to form the context; a space separator keeps
    # the last word of one group from running into the first of the next.
    context = " ".join(passage.strip() for passage, _ in doc_score_pairs)

    # Perform the querying
    QA_input = {'question': query, 'context': context}
    out = pipe(QA_input)
    st.json(out)