yanqiang committed on
Commit bd111f7
1 Parent(s): 96a6f43
.gitignore CHANGED
@@ -1 +1,3 @@
 .idea
+cache
+docs/zh_wikipedia
README.md CHANGED
@@ -4,7 +4,7 @@

 ## 🔥 Demo

-![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/result.png)
+![](https://github.com/yanqiangmiffy/Chinese-LangChain/blob/master/images/web_demo.png)

 ## 🚀 Features

@@ -22,6 +22,7 @@
 * [ ] Filter and rank retrieval results
 * [ ] Integrate internet search results
 * [ ] Fix model initialization issues
+* [ ] Add non-LangChain strategies

 ## Community
 Suggestions and bad cases are welcome. The project is still far from complete; feel free to join the group chat, and PRs are very welcome.
cache/index.faiss DELETED
Binary file (53.3 kB)
 
cache/index.pkl DELETED
Binary file (5.43 kB)
 
create_knowledge.py ADDED
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 _*-
+"""
+@author:quincy qiang
+@license: Apache Licence
+@file: create_knowledge.py
+@time: 2023/04/18
+@contact: yanqiangmiffy@gamil.com
+@software: PyCharm
+@description: coding..
+"""
+from langchain.docstore.document import Document
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from tqdm import tqdm
+
+# Example: importing Chinese Wikipedia data
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'
+embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+
+docs = []
+
+with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
+    for idx, line in tqdm(enumerate(f.readlines())):
+        metadata = {"source": f'doc_id_{idx}'}
+        docs.append(Document(page_content=line.strip(), metadata=metadata))
+
+vector_store = FAISS.from_documents(docs, embeddings)
+vector_store.save_local('cache/zh_wikipedia/')
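For reference, a minimal sketch of how the index saved above could be loaded back and queried. This is illustrative, not part of the commit; it assumes the same embedding model, and uses `FAISS.load_local`, the langchain counterpart to `save_local`:

```python
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Assumption: the same embedding model that built the index above.
embeddings = HuggingFaceEmbeddings(model_name='/home/searchgpt/pretrained_models/ernie-gram-zh')
vector_store = FAISS.load_local('cache/zh_wikipedia/', embeddings)

# Fetch the two nearest passages together with their distance scores.
print(vector_store.similarity_search_with_score(query='科比', k=2))
```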
images/result.png DELETED
Binary file (72.3 kB)
 
images/web_demo.png ADDED
main.py CHANGED
@@ -10,8 +10,8 @@ os.environ["CUDA_VISIBLE_DEVICES"] = '0'

 # Change this to your own configuration!!!
 class LangChainCFG:
-    llm_model_name = 'THUDM/chatglm-6b-int4-qe'  # local model file or huggingface remote repo
-    embedding_model_name = 'GanymedeNil/text2vec-large-chinese'  # retrieval model file or huggingface remote repo
+    llm_model_name = '../../pretrained_models/chatglm-6b-int4-qe'  # local model file or huggingface remote repo
+    embedding_model_name = '../../pretrained_models/text2vec-large-chinese'  # retrieval model file or huggingface remote repo
     vector_store_path = './cache'
     docs_path = './docs'

@@ -91,19 +91,24 @@ with block as demo:
                 label="large language model",
                 value="ChatGLM-6B-int4")

-            with gr.Tab("select"):
-                selectFile = gr.Dropdown(file_list,
-                                         label="content file",
-                                         interactive=True,
-                                         value=file_list[0] if len(file_list) > 0 else None)
-            with gr.Tab("upload"):
-                file = gr.File(label="Please upload a knowledge base file",
-                               file_types=['.txt', '.md', '.docx', '.pdf']
-                               )
+            top_k = gr.Slider(1,
+                              20,
+                              value=2,
+                              step=1,
+                              label="vector match top k",
+                              interactive=True)
+            kg_name = gr.Radio(['中文维基百科', '百度百科数据', '坦克世界'],
+                               label="knowledge base",
+                               value='中文维基百科',
+                               interactive=True)
+            file = gr.File(label="Upload a file to the database",
+                           visible=True,
+                           file_types=['.txt', '.md', '.docx', '.pdf']
+                           )

             file.upload(upload_file,
                         inputs=file,
-                        outputs=selectFile)
+                        outputs=None)
         with gr.Column(scale=4):
             with gr.Row():
                 with gr.Column(scale=4):
@@ -137,4 +142,11 @@ with block as demo:
         ],
         outputs=[message, chatbot, state, search])

-demo.queue(concurrency_count=2).launch(server_name='0.0.0.0', server_port=8888, share=False,show_error=True, enable_queue=True)
+demo.queue(concurrency_count=2).launch(
+    server_name='0.0.0.0',
+    server_port=8888,
+    share=False,
+    show_error=True,
+    debug=True,
+    enable_queue=True
+)
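The new upload control is wired to an `upload_file` callback whose body is outside this diff. A hypothetical sketch of what such a handler might do (the copy destination and behavior are assumptions, not the commit's actual code):

```python
import os
import shutil

# Hypothetical handler: copy the uploaded temp file into the docs directory
# so it can be indexed into the vector store; returning nothing matches the
# outputs=None wiring above.
def upload_file(file):
    os.makedirs('docs/added', exist_ok=True)
    shutil.copy(file.name, os.path.join('docs/added', os.path.basename(file.name)))
```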
tests/test_duckduckgo_search.py CHANGED
@@ -2,9 +2,9 @@ from duckduckgo_search import ddg
 from duckduckgo_search.utils import SESSION


-SESSION.proxies = {
-    "http": f"socks5h://localhost:7890",
-    "https": f"socks5h://localhost:7890"
-}
+# SESSION.proxies = {
+#     "http": f"socks5h://localhost:7890",
+#     "https": f"socks5h://localhost:7890"
+# }
 r = ddg("马保国")
 print(r)
tests/test_langchain.py CHANGED
@@ -4,8 +4,8 @@ from langchain.document_loaders import UnstructuredFileLoader
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS

-embedding_model_name = 'pretrained_models/ernie-gram-zh'
-docs_path = 'docs'
+embedding_model_name = '/home/searchgpt/pretrained_models/ernie-gram-zh'
+docs_path = '/home/searchgpt/yq/Knowledge-ChatGLM/docs'
 embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

 docs = []
@@ -22,7 +22,7 @@ vector_store.save_local('vector_store_local')
 search_result = vector_store.similarity_search_with_score(query='科比', k=2)
 print(search_result)

-loader = UnstructuredFileLoader(f'{docs_path}/added/科比.txt', mode="elements")
+loader = UnstructuredFileLoader(f'{docs_path}/added/马保国.txt', mode="elements")
 doc = loader.load()
 vector_store.add_documents(doc)
 print(doc)