lauraparra28 committed on
Commit
df5eb14
·
verified ·
1 Parent(s): ddfb912

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +112 -116
functions.py CHANGED
@@ -1,116 +1,112 @@
1
- import chardet
2
- import torch
3
- from langchain_openai import ChatOpenAI, OpenAI
4
- from langchain_core.prompts import PromptTemplate
5
- from langchain.prompts import PromptTemplate
6
- from sentence_transformers import SentenceTransformer
7
- import os
8
- import pandas as pd
9
- import json
10
-
11
- current_dir = os.getcwd()
12
-
13
- def load_api_key(file_path):
14
- with open(file_path, 'r', encoding='utf-8') as file:
15
- data = json.load(file)
16
- return data.get('api_key')
17
-
18
- def load_dictionary(json_path):
19
- with open(json_path, 'r', encoding='utf-8') as file:
20
- return json.load(file)
21
-
22
- def detect_encoding(file_path):
23
- with open(file_path, 'rb') as file:
24
- raw_data = file.read()
25
- result = chardet.detect(raw_data)
26
- return result['encoding']
27
-
28
- def load_text(file_path):
29
- encoding = detect_encoding(file_path)
30
- with open(file_path, 'r', encoding=encoding) as file:
31
- return file.read()
32
-
33
- def search_query(query, embeddings_tensor, model, segment_contents, file_names, k=5):
34
- query_embedding = torch.tensor(model.encode(query)).unsqueeze(0)
35
- similarities = torch.mm(query_embedding, embeddings_tensor.t()).squeeze(0)
36
- topk_similarities, topk_indices = torch.topk(similarities, k)
37
-
38
- top_segments = [segment_contents[idx] for idx in topk_indices]
39
- top_file_names = [file_names[idx] for idx in topk_indices]
40
- top_similarities = topk_similarities.tolist()
41
-
42
- return top_segments, top_file_names, top_similarities
43
-
44
- def load_embeddings(file_path="embeddings/embeddings.xlsx"):
45
- embeddings_df = pd.read_excel(os.path.join(current_dir, file_path))
46
- embeddings = embeddings_df.iloc[:, :-3].values
47
- segment_contents = embeddings_df['segment_content'].values
48
- num_segment_contents = len(segment_contents)
49
- num_documents = embeddings_df['file_name'].nunique()
50
- file_names = embeddings_df['file_name'].values
51
- model_name = embeddings_df['model_name'].values[0]
52
-
53
- return {
54
- "embeddings": embeddings,
55
- "segment_contents": segment_contents,
56
- "num_documents": num_documents,
57
- "num_segment_contents": num_segment_contents,
58
- "file_names": file_names,
59
- "model_name": model_name,
60
- }
61
-
62
- def generate_answer_with_references(query, data):
63
- embeddings = data["embeddings"]
64
- segment_contents = data["segment_contents"]
65
- model_name = data["model_name"]
66
- file_names = data["file_names"]
67
- embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)
68
- model = SentenceTransformer(model_name)
69
- dictionary_path = os.path.join(current_dir, 'documents_names.json')
70
- file_name_dict = load_dictionary(dictionary_path)
71
- file_names = [file_name_dict.get(name, name) for name in file_names]
72
-
73
- top_segments, top_file_names, top_similarities = search_query(query, embeddings_tensor, model, segment_contents, file_names, k=5)
74
- context = "\n----\n".join(top_segments)
75
- prompt_template = """
76
- Você é um assistente de inteligência artificial que responde a perguntas baseadas nos documentos de forma detalhada na forma culta da língua portuguesa.
77
- Não é possível gerar informações ou fornecer informações que não estejam contidas nos documentos recuperados.
78
- Se a informação não se encontra nos documentos, responda com: Não foi possível encontrar a informação requerida nos documentos.
79
-
80
- Contexto:
81
-
82
- {context}
83
-
84
- Pergunta: {query}
85
-
86
- Resposta:""".format(context=context, query=query)
87
-
88
- qa_prompt = PromptTemplate.from_template(prompt_template)
89
- api_key = load_api_key('api_key.json')
90
-
91
- llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo")
92
- response = llm.invoke(qa_prompt.template)
93
- resposta = response.content
94
- total_tokens = response.response_metadata['token_usage']['total_tokens']
95
- prompt_tokens = response.response_metadata['token_usage']['prompt_tokens']
96
-
97
- return resposta, total_tokens, prompt_tokens, top_segments, top_file_names, top_similarities, prompt_template
98
-
99
- def rag_response(query, data, detailed_response):
100
- resposta, total_tokens, prompt_tokens, top_segments, top_file_names, top_similarities, prompt_template = generate_answer_with_references(query, data)
101
- file_names = [x[0] for x in top_file_names]
102
- file_links = {x[0]: x[1] for x in top_file_names}
103
-
104
- if detailed_response==True:
105
- references_detail = "\n\n".join([
106
- f"* Segmento: {segment}\nArquivo: <a href='{file_links[file_name]}' target='_blank'>{file_name}</a>\nSimilaridade: {similarity:.4f}"
107
- for segment, file_name, similarity in zip(top_segments, file_names, top_similarities)])
108
-
109
- formatted_detailed_response = f"Resposta:\n\n{resposta}\n\nPrompt:\n{prompt_template}\n\nPrompt Tokens: {prompt_tokens}\nTotal Tokens: {total_tokens}\n\n{references_detail}"
110
-
111
- return formatted_detailed_response
112
- else:
113
- file_set = set(file_name for file_name in file_names)
114
- references = "\n".join("<a href='{}' target='_blank'>{}</a>".format(file_links[file_name], file_name) for file_name in file_set)
115
- formatted_response = f"{resposta}\n\n----\n{references}"
116
- return formatted_response
 
1
+ import chardet
2
+ import torch
3
+ from langchain_openai import ChatOpenAI, OpenAI
4
+ from langchain_core.prompts import PromptTemplate
5
+ from langchain.prompts import PromptTemplate
6
+ from sentence_transformers import SentenceTransformer
7
+ import os
8
+ import pandas as pd
9
+ import json
10
+
11
# Base directory used to resolve data files. It is captured once at import
# time, so a later os.chdir() does not change where files are looked up.
current_dir = os.getcwd()
# OpenAI credential taken from the environment; None when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY")
13
+
14
def load_dictionary(json_path):
    """Parse the UTF-8 encoded JSON file at ``json_path`` and return its content."""
    with open(json_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
17
+
18
def detect_encoding(file_path):
    """Return chardet's guess for the text encoding of ``file_path``.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the ``'encoding'`` entry of its result is returned unchanged.
    """
    with open(file_path, 'rb') as stream:
        payload = stream.read()
    detection = chardet.detect(payload)
    return detection['encoding']
23
+
24
def load_text(file_path):
    """Read ``file_path`` as text using its detected encoding.

    The encoding comes from ``detect_encoding``. When detection yields
    nothing (a falsy value such as ``None``), UTF-8 is used explicitly
    instead of letting ``open`` fall back to the platform locale default,
    which would make the result depend on the machine running the code.

    Returns:
        The full decoded content of the file as a string.
    """
    encoding = detect_encoding(file_path) or 'utf-8'
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()
28
+
29
def search_query(query, embeddings_tensor, model, segment_contents, file_names, k=5):
    """Return the ``k`` stored segments most similar to ``query``.

    The query is embedded with ``model.encode`` and scored against every row
    of ``embeddings_tensor`` with a matrix product (dot-product similarity;
    this equals cosine similarity only if the vectors are normalised, which
    this function does not check).

    Args:
        query: Text to search for; passed to ``model.encode``.
        embeddings_tensor: 2-D tensor with one embedding per row.
        model: Object exposing ``encode(query)`` that returns a 1-D vector.
        segment_contents: Sequence aligned with the rows of
            ``embeddings_tensor``.
        file_names: Sequence aligned with the rows of ``embeddings_tensor``.
        k: Maximum number of results. It is clamped to the number of stored
            embeddings, so asking for more than exist returns all of them
            instead of raising.

    Returns:
        A tuple ``(top_segments, top_file_names, top_similarities)`` of three
        lists ordered from most to least similar.
    """
    # Match the stored embeddings' dtype/device so torch.mm cannot fail on a
    # float64-vs-float32 (or CPU-vs-GPU) mismatch.
    query_embedding = torch.as_tensor(
        model.encode(query),
        dtype=embeddings_tensor.dtype,
        device=embeddings_tensor.device,
    ).unsqueeze(0)
    similarities = torch.mm(query_embedding, embeddings_tensor.t()).squeeze(0)

    # torch.topk raises when k exceeds the number of candidates.
    k = max(0, min(k, similarities.numel()))
    topk_similarities, topk_indices = torch.topk(similarities, k)

    # Plain ints index lists and numpy arrays alike.
    indices = topk_indices.tolist()
    top_segments = [segment_contents[idx] for idx in indices]
    top_file_names = [file_names[idx] for idx in indices]
    top_similarities = topk_similarities.tolist()

    return top_segments, top_file_names, top_similarities
39
+
40
def load_embeddings(file_path="embeddings/embeddings.xlsx"):
    """Load the embeddings workbook and unpack it into a plain dict.

    ``file_path`` is joined onto the module-level ``current_dir``. Every
    column except the last three is taken as the embedding matrix, while the
    ``segment_content``, ``file_name`` and ``model_name`` columns are read by
    name. The model name is taken from the first row only.

    Returns:
        A dict with the keys ``embeddings``, ``segment_contents``,
        ``num_documents`` (count of distinct file names),
        ``num_segment_contents``, ``file_names`` and ``model_name``.
    """
    workbook_path = os.path.join(current_dir, file_path)
    frame = pd.read_excel(workbook_path)
    contents = frame['segment_content'].values
    names_column = frame['file_name']
    return {
        "embeddings": frame.iloc[:, :-3].values,
        "segment_contents": contents,
        "num_documents": names_column.nunique(),
        "num_segment_contents": len(contents),
        "file_names": names_column.values,
        "model_name": frame['model_name'].values[0],
    }
57
+
58
def generate_answer_with_references(query, data):
    """Answer ``query`` with gpt-3.5-turbo using the five best-matching segments.

    Args:
        query: The user's question.
        data: Dict as produced by ``load_embeddings``; the keys
            ``embeddings``, ``segment_contents``, ``model_name`` and
            ``file_names`` are read.

    Returns:
        A 7-tuple ``(resposta, total_tokens, prompt_tokens, top_segments,
        top_file_names, top_similarities, prompt_template)`` where
        ``prompt_template`` is the fully formatted prompt sent to the model.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # The key is read from the environment at call time. The previous call to
    # load_api_key() referenced a helper that no longer exists in this module
    # and raised NameError on every request.
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

    embeddings_tensor = torch.tensor(data["embeddings"], dtype=torch.float32)
    # NOTE(review): the sentence-transformer is re-instantiated on every call;
    # consider caching it per model name if latency matters.
    model = SentenceTransformer(data["model_name"])

    # Map raw file names to their entries in documents_names.json; names
    # missing from the dictionary are kept as-is.
    dictionary_path = os.path.join(current_dir, 'documents_names.json')
    file_name_dict = load_dictionary(dictionary_path)
    file_names = [file_name_dict.get(name, name) for name in data["file_names"]]

    top_segments, top_file_names, top_similarities = search_query(
        query, embeddings_tensor, model, data["segment_contents"], file_names, k=5
    )
    context = "\n----\n".join(top_segments)
    prompt_template = """
Você é um assistente de inteligência artificial que responde a perguntas baseadas nos documentos de forma detalhada na forma culta da língua portuguesa.
Não é possível gerar informações ou fornecer informações que não estejam contidas nos documentos recuperados.
Se a informação não se encontra nos documentos, responda com: Não foi possível encontrar a informação requerida nos documentos.

Contexto:

{context}

Pergunta: {query}

Resposta:""".format(context=context, query=query)

    # Send the formatted text directly. Re-parsing it with
    # PromptTemplate.from_template would treat braces inside the retrieved
    # segments or the question as template fields and can fail on stray ones.
    llm = ChatOpenAI(api_key=openai_key, model="gpt-3.5-turbo")
    response = llm.invoke(prompt_template)
    resposta = response.content
    token_usage = response.response_metadata['token_usage']
    total_tokens = token_usage['total_tokens']
    prompt_tokens = token_usage['prompt_tokens']

    return resposta, total_tokens, prompt_tokens, top_segments, top_file_names, top_similarities, prompt_template
94
+
95
def _split_reference(entry):
    """Return ``(display_name, link)`` for one retrieved file entry.

    Entries are presumably ``[display_name, url]`` pairs coming from
    documents_names.json (verify against that file). A bare string is a file
    name that had no dictionary entry, so it has no link.
    """
    if isinstance(entry, str):
        return entry, None
    return entry[0], entry[1]


def _reference_html(name, link):
    """Render a reference as an anchor, or as plain text when there is no link."""
    if link is None:
        return str(name)
    return "<a href='{}' target='_blank'>{}</a>".format(link, name)


def rag_response(query, data, detailed_response):
    """Build the final answer text, with source references, for ``query``.

    Args:
        query: The user's question.
        data: Embeddings dict forwarded to ``generate_answer_with_references``.
        detailed_response: When truthy, the result also contains the prompt,
            token counts and every retrieved segment with its similarity.
            Otherwise only the answer and the list of distinct source files
            are returned.

    Returns:
        The formatted response as a single string containing HTML anchors.
    """
    (resposta, total_tokens, prompt_tokens, top_segments, top_file_names,
     top_similarities, prompt_template) = generate_answer_with_references(query, data)

    # Indexing a bare string entry with [0]/[1] would yield its first two
    # characters, so entries are unpacked through _split_reference instead.
    file_names = []
    file_links = {}
    for entry in top_file_names:
        name, link = _split_reference(entry)
        file_names.append(name)
        file_links[name] = link

    if detailed_response:
        references_detail = "\n\n".join(
            f"* Segmento: {segment}\nArquivo: {_reference_html(file_name, file_links[file_name])}\nSimilaridade: {similarity:.4f}"
            for segment, file_name, similarity in zip(top_segments, file_names, top_similarities)
        )
        return f"Resposta:\n\n{resposta}\n\nPrompt:\n{prompt_template}\n\nPrompt Tokens: {prompt_tokens}\nTotal Tokens: {total_tokens}\n\n{references_detail}"

    # dict.fromkeys de-duplicates while keeping retrieval order, so the
    # reference list is stable between runs (set iteration order is not).
    unique_names = dict.fromkeys(file_names)
    references = "\n".join(
        _reference_html(file_name, file_links[file_name]) for file_name in unique_names
    )
    return f"{resposta}\n\n----\n{references}"