aka7774 committed on
Commit
06c6776
1 Parent(s): 45b1e82

Upload 7 files

Browse files
Files changed (7) hide show
  1. app.py +64 -0
  2. fn.py +120 -0
  3. install.bat +56 -0
  4. main.py +43 -0
  5. refresh.py +23 -0
  6. requirements.txt +6 -0
  7. venv.sh +7 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fn
2
+ import gradio as gr
3
+ from refresh import create_refresh_button
4
+
5
def fn_search(dir, query):
    """Return the best-matching passage for ``query`` from one store.

    Loads the index saved for ``dir`` through ``fn.vector_load`` and asks
    ``fn.search`` for a single hit (``k`` is fixed to 1).

    Args:
        dir: Store name selected in the UI.
        query: Free-text search query.

    Returns:
        The first value returned by ``fn.search``; the per-hit detail list
        it also returns is discarded.
    """
    search_args = dict(dir=dir, query=query, k=1)
    store = fn.vector_load(search_args)
    best_hit, _detail = fn.search(store, search_args)
    return best_hit
15
+
16
# Gradio front end: a shared store selector plus an "Upload" and a "Search" tab.
with gr.Blocks() as demo:
    title = gr.Markdown('# FAISS(Naive RAG)')
    # Receives the status message returned by fn.upload.
    info = gr.Markdown()
    # Store name used by both tabs. allow_custom_value lets the user type a
    # name that is not in the list yet.
    upload_dir = gr.Dropdown(
        label='name',
        choices=fn.load_dirs(),
        allow_custom_value=True,
        interactive=True,
        show_label=True,
    )

    with gr.Tab('Upload'):
        chunk_size = gr.Textbox(
            label='(optional) chunk_size if split',
            value=0,
            interactive=True,
            show_label=True,
        )
        rag_zip = gr.UploadButton(
            label='Zip Upload and save rag_dir',
            interactive=True,
        )
        # Re-reads the store list so the dropdown choices can be updated
        # without restarting the app.
        create_refresh_button(
            gr,
            upload_dir,
            lambda: None,
            lambda: {'choices': fn.load_dirs()},
            'refresh-button',
            interactive=True,
        )

    with gr.Tab('Search'):
        query = gr.Textbox(
            label='query',
            lines=1,
            interactive=True,
            show_label=True,
            show_copy_button=True,
        )
        search_button = gr.Button(value='search')
        result = gr.Textbox(label='result', show_label=True, show_copy_button=True)

    # Event wiring.
    search_button.click(fn=fn_search, inputs=[upload_dir, query], outputs=[result])
    rag_zip.upload(fn=fn.upload, inputs=[upload_dir, chunk_size, rag_zip], outputs=[info])

if __name__ == '__main__':
    demo.launch()
fn.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ import requests
5
+
6
+ from langchain.document_loaders.generic import GenericLoader
7
+ from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
+ from langchain_community.vectorstores import FAISS
10
+ from langchain.vectorstores.utils import DistanceStrategy
11
+ from langchain.text_splitter import CharacterTextSplitter
12
+
13
+ os.makedirs('store', exist_ok = True)
14
+
15
def download(args: dict):
    """Fetch source material into ``store/<dir>``.

    The first of the following keys found in ``args`` selects the source:

    * ``zip_url`` -- download an archive and unpack it, replacing any
      existing ``store/<dir>``.
    * ``url`` -- download a single file into ``store/<dir>``.
    * ``text`` -- write the string to ``store/<dir>/text.txt`` (UTF-8).

    If none of them is present nothing is fetched.

    Args:
        args: Request dictionary; ``dir`` is required.

    Raises:
        ValueError: If ``dir`` is missing/empty or does not resolve to a
            location strictly inside the ``store`` directory.
        requests.HTTPError: If the remote server answers with an error status.
    """
    from urllib.parse import urlparse

    name = args.get('dir')
    if not name:
        raise ValueError('require dir')

    # ``dir`` is caller-supplied: refuse values such as "." / ".." / absolute
    # paths, which would let rmtree/unpack/write touch files outside the store.
    store_root = os.path.realpath('store')
    target = os.path.realpath(os.path.join(store_root, str(name)))
    if target == store_root or os.path.commonpath([store_root, target]) != store_root:
        raise ValueError(f"invalid dir: {name}")

    dest = f"store/{name}"

    if 'zip_url' in args:
        res = requests.get(args['zip_url'], timeout=60)
        # Do not unpack an HTML error page as if it were the archive.
        res.raise_for_status()

        # A temporary directory is used instead of re-opening a
        # NamedTemporaryFile by name, which is not possible on Windows while
        # the file is still open.
        with tempfile.TemporaryDirectory() as tmp_dir:
            archive = os.path.join(tmp_dir, 'download.zip')
            with open(archive, 'wb') as f:
                f.write(res.content)

            if os.path.exists(dest):
                shutil.rmtree(dest)
            shutil.unpack_archive(archive, dest)
    elif 'url' in args:
        res = requests.get(args['url'], timeout=60)
        res.raise_for_status()

        os.makedirs(dest, exist_ok=True)
        # Name the file after the URL path only, so a query string does not
        # end up in the file name; fall back when the path ends with "/".
        filename = os.path.basename(urlparse(args['url']).path) or 'download'
        with open(os.path.join(dest, filename), 'wb') as f:
            f.write(res.content)
    elif 'text' in args:
        os.makedirs(dest, exist_ok=True)

        with open(f"{dest}/text.txt", 'w', encoding='utf-8') as f:
            f.write(args['text'])
42
+
43
def docs_load(args: dict):
    """Load the files stored under ``store/<dir>`` as documents.

    Args:
        args: Request dictionary; only ``dir`` is read.

    Returns:
        The result of ``GenericLoader.load()`` for the matched files.
    """
    source_dir = f"store/{args['dir']}"
    # "**/[!.]*" walks the tree recursively and skips names starting with ".".
    return GenericLoader.from_filesystem(
        path=source_dir,
        glob="**/[!.]*",
        show_progress=True,
    ).load()
52
+
53
def chunk_split(docs, chunk_size):
    """Split documents into chunks of at most roughly ``chunk_size`` characters.

    Text is split on blank lines (``'\\n\\n'``) with no overlap between
    chunks, and length is measured with ``len``.

    Args:
        docs: Documents exposing ``page_content`` and ``metadata``.
        chunk_size: Target chunk length in characters.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    text_splitter = CharacterTextSplitter(
        separator='\n\n',
        chunk_size=chunk_size,
        chunk_overlap=0,
        length_function=len,
    )
    # split_documents forwards each document's metadata to its chunks;
    # create_documents() on the bare texts silently dropped it.
    return text_splitter.split_documents(docs)
62
+
63
def vector(docs, args: dict):
    """Embed ``docs`` and build a FAISS vector store from them.

    Args:
        docs: Documents to index.
        args: Not read by this function.

    Returns:
        The vector store created by ``FAISS.from_documents``.
    """
    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
    index_options = {
        'distance_strategy': DistanceStrategy.MAX_INNER_PRODUCT,
        'normalize_L2': True,
    }
    return FAISS.from_documents(documents=docs, embedding=embedder, **index_options)
70
+
71
def vector_save(docs, args: dict):
    """Build a vector store for ``docs`` and save it to ``store/<dir>/vector``.

    Args:
        docs: Documents to index.
        args: Request dictionary; ``dir`` names the target store.

    Returns:
        The vector store that was saved.
    """
    store = vector(docs, args)
    store.save_local(folder_path=f"store/{args['dir']}/vector")
    return store
77
+
78
def vector_load(args: dict):
    """Load the vector store saved under ``store/<dir>/vector``.

    Args:
        args: Request dictionary; only ``dir`` is read.

    Returns:
        The vector store returned by ``FAISS.load_local``.

    Raises:
        ValueError: If the ``vector`` folder does not exist.
    """
    index_dir = f"store/{args['dir']}/vector"
    if os.path.exists(index_dir):
        # Same embedding model and index options as used when building.
        embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
        return FAISS.load_local(
            folder_path=index_dir,
            embeddings=embedder,
            distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
            normalize_L2=True,
        )

    raise ValueError(f"missing store/{args['dir']}/vector")
89
+
90
def search(vector_store, args: dict):
    """Run a similarity search and return the best passage plus all hits.

    Args:
        vector_store: Object exposing
            ``similarity_search_with_score(query=..., k=...)`` that returns
            ``(document, score)`` pairs.
        args: Request dictionary. ``query`` is required; ``k`` is the number
            of hits to request and defaults to 1 when absent or ``None``.

    Returns:
        A ``(best_text, detail)`` tuple. ``detail`` is a list of
        ``[page_content, score]`` pairs in the order returned by the store,
        and ``best_text`` is the content of the first one. When the store
        returns no hits the result is ``('', [])``.
    """
    k = args.get('k')
    k = 1 if k is None else int(k)

    results = vector_store.similarity_search_with_score(query=args['query'], k=k)
    detail = [[doc.page_content, float(score)] for doc, score in results]

    # An empty result used to raise IndexError on results[0].
    best_text = detail[0][0] if detail else ''
    return best_text, detail
96
+
97
def load_dirs():
    """Return the names of the stores available under ``store``.

    Only sub-directories are reported (stray files are not selectable
    stores), and the list is sorted so the UI shows a stable order.

    Returns:
        Sorted list of directory names; empty if ``store`` does not exist.
    """
    try:
        with os.scandir('store') as entries:
            return sorted(entry.name for entry in entries if entry.is_dir())
    except FileNotFoundError:
        return []
103
+
104
def upload(dir, chunk_size, file):
    """Replace ``store/<dir>`` with an uploaded archive and index its contents.

    The archive is unpacked into ``store/<dir>`` (any existing directory is
    removed first), the files are loaded as documents, optionally split into
    chunks, and the resulting vector store is saved.

    Args:
        dir: Store name; must resolve to a location strictly inside ``store``.
        chunk_size: Chunk length as int or numeric string. Empty, ``None``
            or a value <= 0 disables splitting.
        file: Uploaded archive, either an object with a ``name`` attribute
            holding its path or the path itself.

    Returns:
        A ``"saved store/<dir>"`` status message.

    Raises:
        ValueError: If ``dir`` is empty or invalid, ``file`` is missing, or
            ``chunk_size`` is not an integer.
    """
    if not dir:
        raise ValueError('require dir')

    # ``dir`` is typed by the user: values such as "." or ".." would make the
    # rmtree below delete the whole store or a directory outside of it.
    store_root = os.path.realpath('store')
    target = os.path.realpath(os.path.join(store_root, str(dir)))
    if target == store_root or os.path.commonpath([store_root, target]) != store_root:
        raise ValueError(f"invalid dir: {dir}")

    if file is None:
        raise ValueError('require file')

    # The textbox may be left empty; treat that as "do not split".
    if isinstance(chunk_size, str):
        chunk_size = chunk_size.strip()
    size = int(chunk_size) if chunk_size else 0

    args = {
        'dir': dir,
        'chunk_size': size,
    }
    dest = f"store/{args['dir']}"
    archive = getattr(file, 'name', file)

    if os.path.exists(dest):
        shutil.rmtree(dest)
    shutil.unpack_archive(archive, dest)

    docs = docs_load(args)
    if size > 0:
        docs = chunk_split(docs, size)
    vector_save(docs, args)

    return f"saved store/{args['dir']}"
install.bat ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off

rem -------------------------------------------
rem NOT guaranteed to work on Windows

set REPOS=https://huggingface.co/spaces/aka7774/faiss
set APPDIR=faiss
set VENV=venv

rem -------------------------------------------

rem Work from the directory that contains this script.
set INSTALL_DIR=%~dp0
cd /d "%INSTALL_DIR%"

:git_clone
rem Try the git that is already on PATH first.
set DL_URL=%REPOS%
set DL_DST=%APPDIR%
git clone %DL_URL% %APPDIR%
if exist %DL_DST% goto install_python

rem No usable git: fetch a portable copy and clone with that instead.
set DL_URL=https://github.com/git-for-windows/git/releases/download/v2.41.0.windows.3/PortableGit-2.41.0.3-64-bit.7z.exe
set DL_DST=PortableGit-2.41.0.3-64-bit.7z.exe
curl -L -o %DL_DST% %DL_URL%
rem bitsadmin needs an absolute destination path.
if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% "%INSTALL_DIR%%DL_DST%"
%DL_DST% -y
del %DL_DST%

set GIT=%INSTALL_DIR%PortableGit\bin\git
rem Clone into %APPDIR% explicitly so the later "cd %APPDIR%" always matches.
%GIT% clone %REPOS% %APPDIR%

:install_python
rem 64-bit build: the sentence-transformers dependency (torch) publishes no 32-bit Windows wheels.
set DL_URL=https://github.com/indygreg/python-build-standalone/releases/download/20240107/cpython-3.10.13+20240107-x86_64-pc-windows-msvc-shared-install_only.tar.gz
set DL_DST="%INSTALL_DIR%python.tar.gz"
curl -L -o %DL_DST% %DL_URL%
if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
tar -xzf %DL_DST%

rem The interpreter is used from the "python" directory, so Scripts must come from there too.
set PYTHON=%INSTALL_DIR%python\python.exe
set PATH=%PATH%;%INSTALL_DIR%python\Scripts

:install_venv
cd %APPDIR%
%PYTHON% -m venv %VENV%
rem From here on use the interpreter inside the virtual environment.
set PYTHON=%VENV%\Scripts\python.exe

:install_pip
set DL_URL=https://bootstrap.pypa.io/get-pip.py
set DL_DST=%INSTALL_DIR%get-pip.py
curl -o %DL_DST% %DL_URL%
if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
%PYTHON% %DL_DST%

%PYTHON% -m pip install gradio
%PYTHON% -m pip install -r requirements.txt

pause
main.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ import signal
5
+ import io
6
+
7
+ from fastapi import FastAPI, Request, status, Form, UploadFile
8
+ from fastapi.staticfiles import StaticFiles
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from pydantic import BaseModel, Field
11
+ from fastapi.exceptions import RequestValidationError
12
+ from fastapi.responses import JSONResponse
13
+
14
+ import fn
15
+ import gradio as gr
16
+ from app import demo
17
+
18
app = FastAPI()

# NOTE(review): wildcard origins combined with allow_credentials=True is a very
# permissive CORS policy -- confirm it is intended before exposing the service.
_CORS_OPTIONS = {
    'allow_origins': ['*'],
    'allow_credentials': True,
    'allow_methods': ["*"],
    'allow_headers': ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

# Mount the Gradio UI imported from app.py under /gradio on the same application.
gr.mount_gradio_app(app, demo, path="/gradio")
29
+
30
@app.post("/upload")
async def api_save(args: dict):
    """Fetch the requested source, index it and save the vector store.

    Args:
        args: JSON body passed on to the ``fn`` helpers. A truthy
            ``chunk_size`` enables splitting with that size.

    Returns:
        ``{"status": 0, "result": "saved."}`` once the store is saved.
    """
    fn.download(args)
    docs = fn.docs_load(args)

    chunk_size = args.get('chunk_size')
    if chunk_size:
        docs = fn.chunk_split(docs, int(chunk_size))

    fn.vector_save(docs, args)
    return {"status": 0, "result": "saved."}
38
+
39
@app.post("/search")
async def api_search(args: dict):
    """Search a saved vector store.

    Args:
        args: JSON body passed unchanged to ``fn.vector_load`` and
            ``fn.search``.

    Returns:
        A dict with ``status`` 0, the best hit as ``result`` and the scored
        hit list as ``detail``.
    """
    store = fn.vector_load(args)
    best_hit, hit_detail = fn.search(store, args)
    return dict(status=0, result=best_hit, detail=hit_detail)
refresh.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
def create_refresh_button(gr, refresh_component, refresh_method, refreshed_args, elem_class, interactive=True):
    """
    Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui

    Create a button that refreshes another component when clicked.

    Args:
        gr: The gradio module (or an object exposing ``Button`` and ``update``).
        refresh_component: Component whose attributes are updated.
        refresh_method: Zero-argument callable invoked first on every click.
        refreshed_args: Dict of attribute names to new values, or a
            zero-argument callable returning such a dict. ``None`` (or an
            empty dict) means there is nothing to update.
        elem_class: Value passed to the button as ``elem_classes``.
        interactive: Whether the button is interactive.

    Returns:
        The created button.
    """
    refresh_symbol = '🔄'

    def refresh():
        refresh_method()
        args = refreshed_args() if callable(refreshed_args) else refreshed_args
        # Normalise before iterating: the loop used to fail on None even
        # though the update call below already tolerated it.
        args = args or {}

        for k, v in args.items():
            setattr(refresh_component, k, v)

        return gr.update(**args)

    refresh_button = gr.Button(refresh_symbol, elem_classes=elem_class, interactive=interactive)
    refresh_button.click(
        fn=refresh,
        inputs=[],
        outputs=[refresh_component]
    )

    return refresh_button
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ faiss-gpu
4
+ langchain
5
+ sentence-transformers
6
+ python-multipart
venv.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Create ./venv and install the application's Python dependencies into it.

# Stop at the first failing step instead of continuing with a broken venv.
set -euo pipefail

python3 -m venv venv
# Bootstrap pip. TLS verification stays enabled (no -k) because the downloaded
# script is piped straight into the interpreter; -f makes HTTP errors fatal.
curl -fsSL https://bootstrap.pypa.io/get-pip.py | venv/bin/python

venv/bin/python -m pip install gradio
venv/bin/python -m pip install -r requirements.txt