""" 1. 完成了用Qwen通义千问作为知识库查询。 1. 总共有三个区块:知识库回答,应用来源,相关问题。 1. 在Huggingface的API上部署了一个在线BGE的模型,用于回答问题。OpenAI的Emebedding或者Langchain的Embedding都不可以用(会报错: self.d)。 """ ##TODO: 1. 建立一个upload file的模块。 # -*- coding: utf-8 -*- import requests import streamlit as st import openai import os import numpy as np import pandas as pd import csv import tempfile from tempfile import NamedTemporaryFile import pathlib from pathlib import Path import re from re import sub import matplotlib.pyplot as plt from itertools import product from tqdm import tqdm_notebook, tqdm, trange import time from time import sleep from matplotlib.pyplot import style from rich import print import warnings import streamlit_authenticator as stauth # from langchain.vectorstores import FAISS from langchain_community.vectorstores import FAISS from langchain.embeddings.huggingface import HuggingFaceEmbeddings from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnablePassthrough from langchain_core.runnables import RunnableParallel from langchain.llms.base import LLM from langchain.llms.utils import enforce_stop_tokens from typing import Dict, List, Optional, Tuple, Union import requests import json import streamlit as st # import rag_reponse_001 import qwen_response import rag_reponse_002 # import chatgpt # from st_copy_to_clipboard import st_copy_to_clipboard import clipboard import dashscope # warnings.filterwarnings('ignore') from dotenv import load_dotenv load_dotenv() ### 设置openai的API key os.environ["OPENAI_API_KEY"] = os.environ['user_token'] openai.api_key = os.environ['user_token'] bing_search_api_key = os.environ['bing_api_key'] dashscope.api_key = os.environ['dashscope_api_key'] ### Streamlit页面设定。 st.set_page_config(layout="wide") st.title("大语言模型智能知识库查询中心") # st.title("大语言模型本地知识库问答系统") # st.subheader("Large Language Model-based Knowledge Base QA System") # st.warning("_声明:内容由人工智能生成,仅供参考。如果您本人使用或对外传播本服务生成的输出,您应当主动核查输出内容的真实性、准确性,避免传播虚假信息。_") 
# Disclaimer shown under the page title.
st.caption("_声明:内容由人工智能生成,仅供参考。您应当主动核查输出内容的真实性、准确性,避免传播虚假信息。_")


### File-upload module
def upload_file(uploaded_file):
    """Store an uploaded file and, for PDFs, rebuild the user's knowledge base.

    The branch is chosen from the (case-insensitive) extension of
    ``uploaded_file.name``:

    * ``.pdf`` -- the bytes are written to a file of the same name in the
      current working directory; the open handle and the module-level
      ``username`` are passed to ``langchain_KB.langchain_localKB_construct``;
      then the file name and an Asia/Shanghai timestamp are passed to
      ``save_database_info.save_database_info``.
    * ``.csv`` -- parsed with ``pd.read_csv`` and re-saved as
      ``./{username}/{username}_upload.csv``; the first three rows are shown.
    * anything else -- parsed with ``pd.read_excel`` and saved as
      ``./{username}_upload.csv``; the first three rows are shown.

    Args:
        uploaded_file: Object exposing ``.name`` and ``.getvalue()`` that
            pandas can read from (presumably the value returned by
            ``st.file_uploader`` -- confirm against the caller), or ``None``.

    Returns:
        The uploaded file's name, or ``None`` when no file was supplied.
        Any exception raised while processing is displayed with ``st.write``
        rather than propagated, and the name is still returned.
    """
    # Nothing uploaded yet: return early instead of reaching `return filename`
    # with the name unbound.
    if uploaded_file is None:
        return None

    # Bound once, up front, so every branch (including the Excel one and the
    # error path) can return it.
    filename = uploaded_file.name
    suffix = pathlib.Path(filename).suffix.lower()

    try:
        if suffix == '.pdf':
            with st.spinner('正在为您解析新知识库...请耐心等待'):
                ### LangChain-based pipeline. Imported lazily, only when a PDF
                ### actually has to be ingested.
                import langchain_KB
                import save_database_info

                # The file keeps the upload's own name: the original author
                # noted that a generic "{username}_upload.pdf" could lead to
                # wrong file names when citing sources. Only the final path
                # component is used so a crafted name cannot point outside
                # the working directory.
                local_pdf_path = pathlib.Path(filename).name
                with open(local_pdf_path, 'wb') as output_temporary_file:
                    # The content has to be fetched with getvalue() before it
                    # can be written out.
                    output_temporary_file.write(uploaded_file.getvalue())
                    # Make sure the bytes are on disk before the builder runs.
                    # NOTE(review): the builder receives the open write
                    # handle; confirm whether it reads by name or by handle.
                    output_temporary_file.flush()
                    langchain_KB.langchain_localKB_construct(output_temporary_file, username)

                ## Record the knowledge base's name and load time.
                # NOTE(review): `datetime` and `pytz` are not imported in the
                # visible part of this file -- confirm they are imported
                # elsewhere, otherwise this raises NameError (caught below).
                save_database_info.save_database_info(
                    f'./{username}/database_name.csv',
                    filename,
                    datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d %H:%M"),
                )
                st.markdown('新知识库解析成功,请务必刷新页面,然后开启对话 🔃')
        else:
            if suffix == '.csv':
                print('start the csv file processing...')
                csv_file = pd.read_csv(uploaded_file)
                csv_file.to_csv(f'./{username}/{username}_upload.csv', encoding='utf-8', index=False)
                # Preview only; the absolute path of the saved file still has
                # to be resolved by later code.
                st.write(csv_file[:3])
            else:
                # Any other extension is treated as an Excel workbook.
                # NOTE(review): this writes next to the app, not into
                # ./{username}/ like the CSV branch -- confirm which location
                # downstream code expects.
                xls_file = pd.read_excel(uploaded_file)
                xls_file.to_csv(f'./{username}_upload.csv', index=False)
                st.write(xls_file[:3])
            print('end the csv file processing...')
    except Exception as e:
        # UI boundary: show the problem to the user instead of crashing the app.
        st.write(e)

    return filename


### Upload UI (not wired up yet)
# TODO: when ./{username}/faiss_index/index.faiss exists, read the last row of
#   ./{username}/database_name.csv and show the current knowledge base's name
#   and creation date; then offer st.file_uploader(type=["pdf"]) and pass a
#   non-None result to upload_file().
# Embedding options tried so far for the vector store: OpenAIEmbeddings
#   (disallowed_special=()), HuggingFaceEmbeddings with
#   'GanymedeNil/text2vec-large-chinese', and HuggingFaceEmbeddings with a
#   local copy of bge-large-zh. The "No sentence-transformers model found with
#   name" message seen with the local models is a warning, not an error.

### authentication with a local yaml file.
import yaml from yaml.loader import SafeLoader with open('./config.yaml') as file: config = yaml.load(file, Loader=SafeLoader) authenticator = stauth.Authenticate( config['credentials'], config['cookie']['name'], config['cookie']['key'], config['cookie']['expiry_days'], config['preauthorized'] ) user, authentication_status, username = authenticator.login('用户登录', 'main') if authentication_status: with st.sidebar: st.markdown( """