Teapack1 committed on
Commit
c56a0e8
1 Parent(s): 9444ff2

FAISS db store

Browse files
Files changed (3) hide show
  1. fast_app_cz(obsolete).py +0 -110
  2. ingest(obsolete).py +0 -59
  3. ingest.py +0 -38
fast_app_cz(obsolete).py DELETED
@@ -1,110 +0,0 @@
1
- from dotenv import load_dotenv
2
- import os
3
- import json
4
- from fastapi import FastAPI, Request, Form, Response
5
- from fastapi.responses import HTMLResponse
6
- from fastapi.templating import Jinja2Templates
7
- from fastapi.staticfiles import StaticFiles
8
- from fastapi.encoders import jsonable_encoder
9
- from langchain.llms import CTransformers
10
-
11
- from langchain.vectorstores import Chroma
12
- from langchain.text_splitter import RecursiveCharacterTextSplitter
13
-
14
- from langchain.chains import RetrievalQA
15
- from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
16
- from langchain.llms import OpenAI
17
- from langchain import PromptTemplate
18
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
19
-
20
- app = FastAPI()
21
- load_dotenv()
22
- openai_api_key = os.environ.get("OPENAI_API_KEY")
23
- templates = Jinja2Templates(directory="templates")
24
- app.mount("/static", StaticFiles(directory="static"), name="static")
25
- # embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
26
- embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
27
- persist_directory = "stores/seznampara_ul_512"
28
-
29
- llm = OpenAI(openai_api_key=openai_api_key)
30
- # llm = "model\dolphin-2.6-mistral-7b.Q4_K_S.gguf"
31
- # llm = "neural-chat-7b-v3-1.Q4_K_M.gguf"
32
-
33
-
34
- """
35
- ### - Local LLM settings - ###
36
-
37
- config = {
38
- "max_new_tokens": 1024,
39
- "repetition_penalty": 1.1,
40
- "temperature": 0.1,
41
- "top_k": 50,
42
- "top_p": 0.9,
43
- "stream": True,
44
- "threads": int(os.cpu_count() / 2),
45
- }
46
-
47
- llm = CTransformers(
48
- model=llm, model_type="mistral", lib="avx2", **config # for CPU use
49
- )
50
-
51
- ### - Local LLM settings end - ###
52
- """
53
-
54
- prompt_template = """Use the following pieces of information to answer the user's question.
55
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
56
-
57
- Context: {context}
58
- Question: {question}
59
-
60
- Only return the helpful answer below and nothing else.
61
- Helpful answer:
62
- """
63
-
64
- prompt = PromptTemplate(
65
- template=prompt_template, input_variables=["context", "question"]
66
- )
67
-
68
- print("\n Prompt ready... \n\n")
69
-
70
-
71
- model_name = embedding_model
72
- model_kwargs = {"device": "cpu"}
73
- encode_kwargs = {"normalize_embeddings": False}
74
- embedding = HuggingFaceEmbeddings(
75
- model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
76
- )
77
- vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
78
- retriever = vectordb.as_retriever(search_kwargs={"k": 3})
79
-
80
- print("\n Retrieval Ready....\n\n")
81
-
82
-
83
- @app.get("/", response_class=HTMLResponse)
84
- def read_item(request: Request):
85
- return templates.TemplateResponse("index.html", {"request": request})
86
-
87
-
88
- @app.post("/get_response")
89
- async def get_response(query: str = Form(...)):
90
-
91
- chain_type_kwargs = {"prompt": prompt}
92
- qa_chain = RetrievalQA.from_chain_type(
93
- llm=llm,
94
- chain_type="stuff",
95
- retriever=retriever,
96
- return_source_documents=True,
97
- chain_type_kwargs=chain_type_kwargs,
98
- verbose=True,
99
- )
100
- response = qa_chain(query)
101
- print(response)
102
- answer = response["result"]
103
- source_document = response["source_documents"][0].page_content
104
- doc = response["source_documents"][0].metadata["source"]
105
- response_data = jsonable_encoder(
106
- json.dumps({"answer": answer, "source_document": source_document, "doc": doc})
107
- )
108
-
109
- res = Response(response_data)
110
- return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ingest(obsolete).py DELETED
@@ -1,59 +0,0 @@
1
- from langchain.vectorstores import Chroma
2
- from langchain.text_splitter import RecursiveCharacterTextSplitter
3
-
4
- from langchain.document_loaders import (
5
- PyPDFLoader,
6
- DirectoryLoader,
7
- UnstructuredFileLoader,
8
- )
9
- from langchain.document_loaders.csv_loader import CSVLoader
10
- from langchain.embeddings import (
11
- OpenAIEmbeddings,
12
- HuggingFaceBgeEmbeddings,
13
- HuggingFaceEmbeddings,
14
- HuggingFaceInstructEmbeddings,
15
- )
16
-
17
-
18
- persist_directory = "stores/test_512"
19
- data = "data\czech"
20
- chunk = 512
21
- overlap = 128
22
- # embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
23
- embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
24
-
25
- model_name = embedding_model
26
- model_kwargs = {"device": "cpu"}
27
- encode_kwargs = {"normalize_embeddings": False}
28
- embedding = HuggingFaceEmbeddings(
29
- model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
30
- )
31
-
32
- """
33
- loader = CSVLoader(
34
- file_path="data/emails.csv",
35
- encoding="utf-8",
36
- csv_args={
37
- "delimiter": ";",
38
- },
39
- )
40
-
41
- """
42
-
43
- loader = DirectoryLoader(data, show_progress=True)
44
-
45
-
46
- documents = loader.load()
47
- text_splitter = RecursiveCharacterTextSplitter(
48
- chunk_size=chunk,
49
- chunk_overlap=overlap,
50
- )
51
- texts = text_splitter.split_documents(documents)
52
- vectordb = Chroma.from_documents(
53
- documents=texts,
54
- embedding=embedding,
55
- persist_directory=persist_directory,
56
- collection_metadata={"hnsw:space": "cosine"},
57
- )
58
-
59
- print("\n Vector Store Created.......\n\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ingest.py CHANGED
@@ -90,41 +90,3 @@ class Ingest:
90
  vectordb.save_local(self.czech_store)
91
 
92
  print("\n Czech vector Store Created.......\n\n")
93
-
94
-
95
- """
96
-
97
-
98
-
99
- openai_api_key = "<REDACTED-OPENAI-API-KEY>"
100
- persist_directory = "stores/store_512"
101
- data = "data/"
102
- chunk = 512
103
- overlap = 256
104
-
105
- embedding = OpenAIEmbeddings(
106
- openai_api_key=openai_api_key,
107
- model="text-embedding-3-large",
108
- # model_kwargs={"device": "cpu"},
109
- )
110
-
111
- loader = DirectoryLoader(
112
- data, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
113
- )
114
- documents = loader.load()
115
- text_splitter = RecursiveCharacterTextSplitter(
116
- chunk_size=chunk,
117
- chunk_overlap=overlap,
118
- )
119
- texts = text_splitter.split_documents(documents)
120
-
121
- vectordb = Chroma.from_documents(
122
- documents=texts,
123
- embedding=embedding,
124
- persist_directory=persist_directory,
125
- collection_metadata={"hnsw:space": "cosine"},
126
- )
127
-
128
- print("\n Vector Store Created.......\n\n")
129
-
130
- """
 
90
  vectordb.save_local(self.czech_store)
91
 
92
  print("\n Czech vector Store Created.......\n\n")