# Spaces:
# Running
# Running
import numpy as np
import PyPDF2
from nltk.tokenize import sent_tokenize
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
def read_pdf(fname):
    """
    Read a PDF file and extract the text of every page.

    Parameters:
        fname (str): Path of the pdf file.

    Returns:
        text_ext (list): One extracted-text string per page, in page order.
    """
    reader = PyPDF2.PdfReader(fname)
    # Iterate the pages directly instead of indexing via range(len(...)).
    return [page.extract_text() for page in reader.pages]
def sent_tokenize(text_ext):
    """
    Split each page's extracted text into sentences.

    BUG FIX: this function shadows nltk's ``sent_tokenize`` imported at the
    top of the file, so the original body's ``sent_tokenize(i)`` call
    resolved to *this* function and recursed forever (RecursionError on any
    non-empty input).  The nltk tokenizer is re-imported here under a
    private alias so the call reaches nltk.

    Parameters:
        text_ext (list): List of extracted text from the pdf file.

    Returns:
        sent_toks (list): One list of sentence strings per page.
    """
    # Local aliased import avoids the name collision with this function.
    from nltk.tokenize import sent_tokenize as _nltk_sent_tokenize
    sent_toks = [_nltk_sent_tokenize(page) for page in text_ext]
    print("len(sent_toks) ", len(sent_toks))
    return sent_toks
def create_content_embeddings(concat_list):
    """
    Embed the document sentences with a sentence-transformer model.

    Parameters:
        concat_list (list): List of tokenized sentences.

    Returns:
        embeddings: One embedding vector per sentence.
    """
    # Loading the model is expensive (disk + weight init); cache it on the
    # function object so repeated calls reuse the same instance.
    model = getattr(create_content_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        create_content_embeddings._model = model
    return model.encode(concat_list)
def create_query_embeddings(query_text):
    """
    Embed the user's query with a sentence-transformer model.

    Parameters:
        query_text (str): Query entered by the user.

    Returns:
        query_embedding: Embedding vector of the query.
    """
    # Loading the model is expensive (disk + weight init); cache it on the
    # function object so repeated calls reuse the same instance.
    model = getattr(create_query_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        create_query_embeddings._model = model
    return model.encode(query_text)
def calculate_cosine(query_embedding, embeddings, concat_list):
    """
    Cosine similarity between the query embedding and each sentence embedding.

    Parameters:
        query_embedding: Embedding vector of the query.
        embeddings: Sequence of sentence embedding vectors.
        concat_list (list): Tokenized sentences; only its length is used, so
            similarities are computed for the first len(concat_list)
            embeddings (same bounds as the original loop).

    Returns:
        cosine_lis (list): Cosine similarity of the query to each sentence.
    """
    n = len(concat_list)
    if n == 0:
        return []
    query = np.asarray(query_embedding)
    emb = np.asarray(embeddings[:n])
    # One vectorized matrix-vector product replaces the per-sentence loop.
    sims = emb @ query / (norm(query) * norm(emb, axis=1))
    return list(sims)
def fetch_top_rank_ans(cosine_lis, N):
    """
    Return the indices of the N highest cosine-similarity scores.

    Parameters:
        cosine_lis (list): Cosine similarity value per sentence.
        N (int): Number of top sentences to keep.

    Returns:
        indexes_final (list): Indices of the N best-scoring sentences,
        best first (fewer than N if the list is shorter).
    """
    # The original computed this exact ranking twice and then rebuilt the
    # slice with a manual loop that raised IndexError when N exceeded the
    # list length; a single sorted + slice is equivalent and safe.
    indexes_final = sorted(
        range(len(cosine_lis)), key=lambda i: cosine_lis[i], reverse=True)[:N]
    print("indexes_final ", indexes_final)
    return indexes_final
def fetch_most_relevant(indexes_final, concat_list, list1, query):
    """
    Build a GPT-3 prompt from the most relevant sentences plus the user's query.

    Sections are appended in descending similarity order until roughly 500
    characters of context have been accumulated.  BUG FIX: the original
    declared ``chosen_sections_len`` but never incremented it, so the
    ``> 500`` cap was dead code and every section was always included; the
    counter is now updated per section.  Dead locals (``dicts``,
    ``most_relevant_document_sections``, ``chosen_sections_indexes``) are
    removed — they never affected the returned prompt.

    Parameters:
        indexes_final (list): Top-N ranked sentence indices (bounds the
            number of sections considered).
        concat_list (list): Tokenized sentences.
        list1 (list): Cosine similarity value per sentence.
        query (str): Query entered by the user.

    Returns:
        prompt (str): Prompt containing the context sections and the question.
    """
    sorted_indices = sorted(
        range(len(list1)), key=lambda i: list1[i], reverse=True)
    chosen_sections = []
    chosen_sections_len = 0
    for rank in range(len(indexes_final)):
        if chosen_sections_len > 500:
            break
        section = concat_list[sorted_indices[rank]].replace("\n", " ")
        chosen_sections.append(section)
        chosen_sections_len += len(section)  # enforce the ~500-char cap
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    header = """Answer the question as a human in natural language conversation using the provided context, and if the answer is not contained within the text below, say "I don't have that information"\n\nContext:\n"""
    prompt = header + "".join(chosen_sections) + "\n\n Q: " + query + "\n A:"
    return prompt