mikymatt committed on
Commit
98dc5b0
1 Parent(s): e1c4b9f

feat: release

Browse files
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from generateDistractors.senseToVec import S2V
2
+ from keyExtractor.rake import KeyExtractor
3
+ from questionGeneration.questionGeneration import QuestionGeneration
4
+ from summarizer.summarizer import Summarizer
5
+ import gradio as gr
6
+
7
+ sense2Vec = S2V()
8
+ Key = KeyExtractor()
9
+ Question = QuestionGeneration()
10
+ Summary = Summarizer()
11
+
12
def run(text):
    """Build question/answer/distractor entries for ``text``.

    Keyphrases are extracted from the original text; each keyphrase is used
    as the answer of a question generated against a summary of the text.

    Arguments:
        text: The source passage.

    Returns:
        A list with one dict per extracted keyphrase, holding the keys
        ``"question"`` (result of ``Question.get_question``), ``"answer"``
        (the keyphrase, capitalized) and ``"distractors"`` (result of
        ``sense2Vec.execute``). The list is empty when no keyphrase is found.
    """
    imp_keywords = Key.get_keywords(text)
    if not imp_keywords:
        # Without keyphrases the result is empty anyway, so skip the
        # summarization model call entirely.
        return []

    summarized_text = Summary.summarizer(text)
    return [
        {
            "question": Question.get_question(summarized_text, answer),
            "answer": answer.capitalize(),
            "distractors": sense2Vec.execute(answer),
        }
        for answer in imp_keywords
    ]
26
+
27
+ if __name__ == '__main__':
28
+ demo = gr.Interface(fn=run, inputs="text", outputs="json")
29
+ demo.launch()
generateDistractors/Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+ FROM python:3.9
3
+ WORKDIR /code
4
+ COPY ./requirements.txt /code/requirements.txt
5
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
6
+ COPY . /code
7
+ CMD ["uvicorn", "senseToVec:app", "--host", "0.0.0.0", "--port", "1237"]
generateDistractors/mmr.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ import itertools
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+
6
+ #Maximal Marginal Relevance origin: https://maartengr.github.io/KeyBERT/api/mmr.html
7
def mmr(doc_embedding: np.ndarray,
        word_embeddings: np.ndarray,
        words: List[str],
        top_n: int = 5,
        diversity: float = 0.9) -> List[Tuple[str, float]]:
    """ Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.

    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that are relevant to the document while being diverse among themselves.

    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        top_n: The number of keywords/keyphrases to return. Values larger
               than ``len(words)`` are clamped to ``len(words)``.
        diversity: How diverse the selected keywords/keyphrases are.
                   Values between 0 and 1 with 0 being not diverse at all
                   and 1 being most diverse.

    Returns:
        List[Tuple[str, float]]: The selected keywords/keyphrases with their
        cosine similarity to the document (rounded to 4 decimals), in
        selection order. Empty when there are no candidates or ``top_n``
        is not positive.

    """
    # Never try to select more keywords than there are candidates; otherwise
    # the loop below would call argmax on an empty array.
    top_n = min(top_n, len(words))
    if top_n <= 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose the best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR: reward relevance to the document, penalize
        # closeness to what has already been selected
        scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        best_idx = candidates_idx[int(np.argmax(scores))]

        # Update keywords & candidates
        keywords_idx.append(best_idx)
        candidates_idx.remove(best_idx)

    doc_scores = word_doc_similarity.reshape(1, -1)[0]
    return [(words[idx], round(float(doc_scores[idx]), 4)) for idx in keywords_idx]
generateDistractors/readme ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ !wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
2
+ !tar -xvf s2v_reddit_2015_md.tar.gz
generateDistractors/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sense2vec==2.0.1
2
+ sentence_transformers==2.2.2
3
+ pydantic
4
+ fastapi
5
+ uvicorn
generateDistractors/senseToVec.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sense2vec import Sense2Vec
2
+ from fastapi import FastAPI
3
+ from sentence_transformers import SentenceTransformer
4
+ import wget
5
+ import os
6
+ from .mmr import mmr
7
+
8
+ url = 'https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz'
9
+ cmd = 'tar -xvf {}'
10
+
11
class S2V:
    """Produce distractors (alternative options) for an answer word.

    Candidates come from the sense2vec vectors loaded from ``s2v_old`` and
    are re-ranked with MMR on sentence-transformer embeddings.
    """

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L12-v2')
        # Only fetch and unpack the archive when the vectors are not on disk
        # yet (the archive is expected to unpack into ``s2v_old``).
        if not os.path.isdir('s2v_old'):
            filename = wget.download(url)
            if os.system(cmd.format(filename)) != 0:
                raise RuntimeError('could not extract {}'.format(filename))
        self.s2v = Sense2Vec().from_disk('s2v_old')

    def removeDuplicates(self, most_similar, originalword):
        """Return the distinct words of ``most_similar`` except the answer.

        Arguments:
            most_similar: Iterable of entries whose first item is a
                          ``word|SENSE`` key with ``_`` between tokens.
            originalword: The answer; it is never returned as a distractor.

        Returns:
            The words (sense tag removed, ``_`` replaced by spaces) in their
            original order. Duplicates are detected case-insensitively so
            that e.g. "Dog" is dropped when the answer is "dog".
        """
        distractors = []
        seen = {originalword.lower()}
        for each_word in most_similar:
            append_word = each_word[0].split("|")[0].replace("_", " ")
            folded = append_word.lower()
            if folded not in seen:
                seen.add(folded)
                distractors.append(append_word)
        return distractors

    def get_answer_and_distractor_embeddings(self, answer, candidate_distractors):
        """Encode the answer and the candidates with the sentence model.

        Returns:
            A tuple ``(answer_embedding, distractor_embeddings)``.
        """
        answer_embedding = self.model.encode([answer])
        distractor_embeddings = self.model.encode(candidate_distractors)
        return answer_embedding, distractor_embeddings

    def execute(self, originalword):
        """Select distractors for ``originalword``.

        Arguments:
            originalword: The answer word or phrase.

        Returns:
            A dict with the keys ``"answer"`` and ``"distractors"`` (a list
            of at most four strings). The list is empty when sense2vec has
            no sense for the word.
        """
        word = originalword.lower().replace(" ", "_")
        # Find the best-matching sense for a given word based on the available senses and frequency counts.
        sense = self.s2v.get_best_sense(word)
        if sense is None:
            # Word unknown to sense2vec: there is nothing to look up.
            return {"answer": originalword, "distractors": []}

        # Get the most similar entries in the table
        most_similar = self.s2v.most_similar(sense, n=20)
        distractors = self.removeDuplicates(most_similar, originalword)
        # The answer goes first so it takes part in the ranking below
        distractors.insert(0, originalword)

        # Encode distractors and answer
        answer_embedd, distractor_embedds = self.get_answer_and_distractor_embeddings(originalword, distractors)
        # Maximal Marginal Relevance origin: https://maartengr.github.io/KeyBERT/api/mmr.html
        # Do not request more entries than there are candidates.
        top_n = min(5, len(distractors))
        final_distractors = mmr(answer_embedd, distractor_embedds, distractors, top_n)
        filtered_distractors = [dist[0] for dist in final_distractors]

        return {
            "answer": filtered_distractors[0],
            "distractors": filtered_distractors[1:]
        }
57
+
58
+ sense2Vec = S2V()
keyExtractor/.DS_Store ADDED
Binary file (6.15 kB). View file
 
keyExtractor/Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+ FROM python:3.9
3
+ WORKDIR /code
4
+ COPY ./requirements.txt /code/requirements.txt
5
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
6
+ COPY . /code
7
+ CMD ["uvicorn", "rake:app", "--host", "0.0.0.0", "--port", "1234"]
keyExtractor/rake.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rake_nltk import Rake
2
+ import nltk
3
+ nltk.download('stopwords')
4
+
5
+ # Uses stopwords for English from NLTK, and all punctuation characters by
6
+ # default
7
class KeyExtractor:
    """Extract the highest-ranked keyphrases of a text with RAKE."""

    def __init__(self):
        self.model = Rake()

    def get_keywords(self, text, top_n=4):
        """Return the best-ranked keyphrases of ``text``.

        Arguments:
            text: The text to extract keyphrases from.
            top_n: Maximum number of phrases to return (default 4).

        Returns:
            A list of phrases ordered from highest to lowest rank.
        """
        # Extraction given the text.
        self.model.extract_keywords_from_text(text)
        # To get keyword phrases ranked highest to lowest.
        return list(self.model.get_ranked_phrases()[:top_n])
keyExtractor/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ pydantic
2
+ fastapi
3
+ uvicorn
4
+ rake-nltk
questionGeneration/Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+ FROM python:3.9
3
+ WORKDIR /code
4
+ COPY ./requirements.txt /code/requirements.txt
5
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
6
+ COPY . /code
7
+ CMD ["uvicorn", "questionGeneration:app", "--host", "0.0.0.0", "--port", "1236"]
questionGeneration/questionGeneration.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
3
+
4
class QuestionGeneration:
    """Generate a question for a given context and answer with a T5 model."""

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
        self.tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
        self.model = self.model.to(self.device)

    def get_question(self, context, answer, model=None, tokenizer=None):
        """Return a question about ``context`` whose answer is ``answer``.

        Arguments:
            context: The passage the question is generated from.
            answer: The expected answer.
            model: Optional model to use instead of the instance's model.
            tokenizer: Optional tokenizer to use instead of the instance's.

        Returns:
            The decoded question with the ``question:`` marker removed and
            surrounding whitespace stripped.
        """
        if model is None:
            model = self.model
        if tokenizer is None:
            tokenizer = self.tokenizer

        text = "context: {} answer: {}".format(context, answer)
        # Input is truncated to 384 tokens; a single sequence needs no padding.
        encoding = tokenizer(
            text,
            max_length=384,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)
        input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

        outs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            early_stopping=True,
            num_beams=5,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            max_length=72,
        )

        # Only one sequence is requested, so only the first one is decoded.
        question = tokenizer.decode(outs[0], skip_special_tokens=True)
        return question.replace("question:", "").strip()
35
+
36
+ Question = QuestionGeneration()
questionGeneration/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ fastapi
3
+ pydantic
4
+ sentencepiece==0.1.95
5
+ transformers
6
+ uvicorn
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ rake-nltk
3
+ sense2vec==2.0.1
4
+ sentence_transformers==2.2.2
5
+ torch
6
+ sentencepiece==0.1.95
7
+ transformers
8
+ nltk
summarizer/Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+ FROM python:3.9
3
+ WORKDIR /code
4
+ COPY ./requirements.txt /code/requirements.txt
5
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
6
+ COPY . /code
7
+ CMD ["uvicorn", "summarizer:app", "--host", "0.0.0.0", "--port", "1235"]
summarizer/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ pydantic
3
+ fastapi
4
+ sentencepiece==0.1.95
5
+ transformers
6
+ nltk
7
+ uvicorn
summarizer/summarizer.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
3
+ import random
4
+ import numpy as np
5
+ import nltk
6
+
7
+ nltk.download('punkt')
8
+ nltk.download('brown')
9
+ nltk.download('wordnet')
10
+
11
+ from nltk.corpus import wordnet as wn
12
+ from nltk.tokenize import sent_tokenize
13
+
14
+ import locale
15
+ locale.getpreferredencoding = lambda: "UTF-8"
16
+
17
class Summarizer:
    """Summarize text with the ``t5-base`` model."""

    def __init__(self):
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.set_seed(42)

    def set_seed(self, seed: int):
        """Seed the ``random``, NumPy and torch (CPU and CUDA) generators."""
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    @staticmethod
    def _capitalize_first(sentence):
        """Upper-case the first character and leave the rest untouched."""
        # str.capitalize() would also lower-case the remainder, destroying
        # acronyms and proper nouns inside the sentence.
        return sentence[:1].upper() + sentence[1:]

    def postprocesstext(self, content):
        """Capitalize the first letter of every sentence in ``content``.

        Returns:
            The sentences, each preceded by a single space.
        """
        return "".join(
            " " + self._capitalize_first(sent) for sent in sent_tokenize(content)
        )

    def summarizer(self, text, model=None, tokenizer=None):
        """Return a summary of ``text``.

        Arguments:
            text: The text to summarize; newlines are replaced by spaces.
            model: Optional model to use instead of the instance's model.
            tokenizer: Optional tokenizer to use instead of the instance's.

        Returns:
            The post-processed summary, stripped of surrounding whitespace.
        """
        if model is None:
            model = self.model
        if tokenizer is None:
            tokenizer = self.tokenizer

        text = "summarize: " + text.strip().replace("\n", " ")
        max_len = 512
        # Input is truncated to max_len tokens; a single sequence needs no padding.
        encoding = tokenizer(
            text,
            max_length=max_len,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)
        input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

        outs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            early_stopping=True,
            num_beams=3,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            min_length=75,
            max_length=300,
        )

        # Only one sequence is requested, so only the first one is decoded.
        summary = tokenizer.decode(outs[0], skip_special_tokens=True)
        return self.postprocesstext(summary).strip()
68
+
69
+ Summary = Summarizer()
70
+
testers/bleu-4.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.translate.bleu_score import sentence_bleu
2
+
3
# One tokenized reference sentence and the tokenized candidate to score
reference = ['this is a dog'.split()]
candidate = 'this is dog'.split()

# How close the candidate sentence is to the reference ones, one n-gram
# order at a time
individual_scores = [
    ('Individual 1-gram: %f', (1, 0, 0, 0)),
    ('Individual 2-gram: %f', (0, 1, 0, 0)),
    ('Individual 3-gram: %f', (0, 0, 1, 0)),
    ('Individual 4-gram: %f', (0, 0, 0, 1)),
]
for template, weights in individual_scores:
    print(template % sentence_bleu(reference, candidate, weights=weights))

# Equal weight on every n-gram order from 1 to 4
uniform_weights = (0.25, 0.25, 0.25, 0.25)
print('average 4-gram: %f' % sentence_bleu(reference, candidate, weights=uniform_weights))
testers/meteor.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.translate import meteor
2
+ from nltk import word_tokenize
3
+ import nltk
4
+
5
+ nltk.download('punkt')
6
+ nltk.download('wordnet')
7
+
8
# Computes a mean of precision and recall, with more emphasis on recall
references = [word_tokenize('create or update a vm set')]
hypothesis = word_tokenize('creates or updates a virtual machine scale set')
score = meteor(references, hypothesis)

print(f"meteor score: {score}")
testers/rouge-tester.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rouge import Rouge
2
+
3
# Sentence to evaluate and the sentence it is compared against
hypothesis = "this is a dog"
reference = "this is a dog"

rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference, avg=True)

# Printed name for each metric key in the scores
metric_labels = {'r': 'recall', 'p': 'precision', 'f': 'f1_score'}

for rouge_type, metrics in scores.items():
    print(rouge_type)
    for key, value in metrics.items():
        label = metric_labels.get(key)
        if label is not None:
            print(f"{label}: {value}")
    print()