adhinojosa committed
Commit cc3f463
1 Parent(s): 07c2c7d

Create vectorstore.py

Files changed (1)
  1. vectorstore.py +94 -0
vectorstore.py ADDED
@@ -0,0 +1,94 @@
+ import re
+ import uuid
+
+ import fitz  # PyMuPDF
+ import torch
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from langchain.text_splitter import SentenceTransformersTokenTextSplitter
+ from sentence_transformers import CrossEncoder
+
+ # Embedding model for the vector store and cross-encoder used for re-ranking.
+ emb_model_name = "sentence-transformers/all-mpnet-base-v2"
+ sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
+ cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+
+ # Persistent Chroma collection using cosine distance.
+ client = chromadb.PersistentClient(path=".vectorstore")
+ collection = client.get_or_create_collection(
+     name="huerto",
+     embedding_function=sentence_transformer_ef,
+     metadata={"hnsw:space": "cosine"},
+ )
+
+
+ def parse_pdf(file):
+     """Turn a PDF into a list of cleaned per-page texts."""
+     pdf = fitz.open(file)
+     output = []
+     for page_num in range(pdf.page_count):
+         page = pdf[page_num]
+         text = page.get_text()
+         # Merge words hyphenated across line breaks
+         text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
+         # Fix newlines in the middle of sentences
+         text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
+         # Collapse multiple newlines
+         text = re.sub(r"\n\s*\n", "\n\n", text)
+         output.append(text)
+     return output
+
+
+ def file_to_splits(file, tokens_per_chunk, chunk_overlap):
+     """Turn a txt or pdf file into a list of [chunk, metadata, id] pieces."""
+     text_splitter = SentenceTransformersTokenTextSplitter(
+         model_name=emb_model_name,
+         tokens_per_chunk=tokens_per_chunk,
+         chunk_overlap=chunk_overlap,
+     )
+
+     text = parse_pdf(file)
+
+     doc_chunks = []
+     for i, page_text in enumerate(text):
+         chunks = text_splitter.split_text(page_text)
+         for j, chunk in enumerate(chunks):
+             doc = [
+                 chunk,
+                 {"source": file.split("/")[-1], "page": i + 1, "chunk": j + 1},
+                 str(uuid.uuid4()),
+             ]
+             doc_chunks.append(doc)
+     return doc_chunks
+
+
+ def file_to_vs(file, tokens_per_chunk, chunk_overlap):
+     """Chunk a file and add the chunks to the Chroma collection."""
+     try:
+         splits = file_to_splits(file, tokens_per_chunk, chunk_overlap)
+         documents, metadatas, ids = zip(*splits)
+         collection.add(documents=list(documents), metadatas=list(metadatas), ids=list(ids))
+         return "Files uploaded successfully"
+     except Exception as e:
+         return str(e)
+
+
+ def similarity_search(query, k):
+     """Retrieve candidates from Chroma, re-rank them with the cross-encoder and return the top k."""
+     sources = {}
+     ss_out = collection.query(query_texts=[query], n_results=20)
+     for idx in range(len(ss_out["ids"][0])):
+         # Cross-encoder relevance score squashed to [0, 1] with a sigmoid
+         score = float(
+             cross_encoder.predict(
+                 [query, ss_out["documents"][0][idx]],
+                 activation_fct=torch.nn.Sigmoid(),
+             )
+         )
+         sources[str(idx)] = {
+             "page_content": ss_out["documents"][0][idx],
+             "metadata": ss_out["metadatas"][0][idx],
+             "similarity": round(score * 100, 2),
+         }
+
+     # Keep only the k best-scoring chunks
+     sorted_sources = sorted(sources.items(), key=lambda x: x[1]["similarity"], reverse=True)
+     sources = {}
+     for idx in range(k):
+         sources[str(idx)] = sorted_sources[idx][1]
+
+     return sources
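
A minimal usage sketch for these helpers, assuming the module is importable as vectorstore; the PDF path, the query, and the chunking parameters below are placeholder values rather than anything this file prescribes:

    from vectorstore import file_to_vs, similarity_search

    # Index a local PDF with 256-token chunks and 32 tokens of overlap (example values)
    print(file_to_vs("docs/huerto.pdf", tokens_per_chunk=256, chunk_overlap=32))

    # Query the collection and show the 3 best cross-encoder-ranked chunks
    for rank, hit in similarity_search("when should the garden be watered?", k=3).items():
        print(rank, hit["similarity"], hit["metadata"]["source"], "page", hit["metadata"]["page"])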