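# Helper functions for this Space: loading PDF / JSON files, storing text chunks
# in the s_context table (via holo_query_func) and in Pinecone, and building
# retrieval-augmented prompts for OpenAI chat completions.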
import json
import os

import openai
import PyPDF2
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from loguru import logger

from bin_public.config.presets import MIGRAINE_PROMPT
from bin_public.utils.utils_db import *

PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_API_ENV = os.environ['PINECONE_API_ENV']

def load_local_file_PDF(path, file_name):
    # Read a local PDF, concatenate the text of all pages, then split it into
    # ~1000-character chunks. Returns {'<file_stem>_<chunk_id>': chunk_text}.
    result = {}
    temp = ''
    with open(path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            temp += page.extract_text()
    if file_name.endswith('.pdf'):
        index = file_name[:-4]
    else:
        index = file_name
    temp = temp.replace('\n', '').replace('\t', '')
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_text(temp)
    for i, content in enumerate(texts):
        result[f'{index}_{i}'] = content
    return result

def holo_query_insert_file_contents(file_name, file_content):
    # Insert one (file_name, content) row into the s_context table.
    # Note: values are interpolated directly into the SQL string, so the
    # content must not contain unescaped single quotes.
    run_sql = f"""
    insert into s_context(
        file_name,
        content
    )
    select
        '{file_name}' as file_name,
        '{file_content}' as content
    """
    holo_query_func(run_sql, is_query=0)

def holo_query_get_content(run_sql):
    temp = []
    data = holo_query_func(run_sql, is_query=1)
    for i in data:
        temp.append(i[1].replace('\n', '').replace('\t', ''))
    return temp

def pdf2database(path, file_name):
    # Read a local PDF, split its text into ~1000-character chunks, and store
    # every chunk as a row in the s_context table.
    temp = ''
    with open(path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            temp += page.extract_text()
    if file_name.endswith('.pdf'):
        index = file_name[:-4]
    else:
        index = file_name
    temp = temp.replace('\n', '').replace('\t', '')
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_text(temp)
    for i in range(len(texts)):
        holo_query_insert_file_contents(f'{index}_{i}', f'{texts[i]}')
        logger.info(f'{index}_{i} stored')

def load_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def get_content_from_json(path):
    # Flatten a JSON list of single-key dicts into "key,value" strings.
    result = []
    data = load_json(path)
    for item in data:
        key = list(item.keys())[0]
        value = item[key]
        result.append(key + ',' + value)
    return result

def data2embeddings(index_name, data, embeddings):
    # Embed the given texts and upsert them into the named Pinecone index.
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    Pinecone.from_texts([t for t in data], embeddings, index_name=index_name)
    logger.info("Stored Successfully")

def context_construction(api_key, query, model, pinecone_api_key, pinecone_api_env, temperature, index_name, mode="map_reduce"):
    # Retrieve the two most similar chunks from Pinecone and build a context
    # string that is prepended to the user prompt. The Chinese prefix
    # "用以下资料进行辅助回答" means "use the following material to help answer".
    temp = []
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    # llm = OpenAI(temperature=temperature, openai_api_key=api_key, model_name=model)
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_api_env)
    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
    # chain = load_qa_chain(llm, chain_type=mode)
    if not any(char.isalnum() for char in query):
        # Nothing searchable in the query; fall back to the default prompt.
        return " ", MIGRAINE_PROMPT, "Connecting to Pinecone"
    else:
        docs = docsearch.similarity_search(query, include_metadata=True, k=2)
        # response = chain.run(input_documents=docs, question=str(query))
        for i in docs:
            temp.append(i.page_content)
        return '用以下资料进行辅助回答\n' + ' '.join(temp), '\n' + ' '.join(temp), "Connecting to Pinecone"

def chat_prerequisites(input, filter, embeddings, top_k=4):
    # filter: dict passed to Pinecone's similarity_search as a metadata filter.
    # The Chinese prompt below gives the model a free-form job description plus
    # four standardized category descriptions labelled 选项一..选项四 ("option one"
    # to "option four") and asks it to reply with exactly one option label.
    # input_prompt = '只基于以下规范的两种分类对形如 "position_name: xx job_name: xx job_description: xxx"的描述进行分类,只要回复规范的类别名'
    input_prompt = '接下来我会给你一段"不规范的招聘职位描述",以及4个用(选项一,选项二,选项三,选项四)四个选项表示的规范的职业分类描述。' \
                   '你需要将"不规范的招聘职位描述"归类为”选项一“或“选项二”或“选项三”或“选项四”。' \
                   '你只需要回复”选项一“或“选项二”或“选项三”或“选项四”,不要回复任何别的东西'
    query = input_prompt + input
    temp = []
    docsearch = Pinecone.from_existing_index(index_name=pinecone.list_indexes()[0], embedding=embeddings)
    docs = docsearch.similarity_search(query, k=top_k, filter=filter)
    # Label the retrieved candidates as 选项一..选项四 so the model can pick one.
    for index, i in enumerate(docs):
        if index == 0:
            temp.append("选项一:" + i.page_content + "##")
        if index == 1:
            temp.append("选项二:" + i.page_content + "##")
        if index == 2:
            temp.append("选项三:" + i.page_content + "##")
        if index == 3:
            temp.append("选项四:" + i.page_content + "##")
    system_prompt = ' '.join(temp)
    return system_prompt, query

def chat(input, filter, embeddings):
    # Classify one job description: retrieve candidate categories, then ask
    # gpt-3.5-turbo to pick one of the four options.
    system_prompt, query = chat_prerequisites(input, filter, embeddings)
    logger.info('prerequisites satisfied')
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ])
    return completion.choices[0].message['content'], system_prompt

def chat_data_cleaning(input):
    # The Chinese prompt asks the model to extract the job title and a one- or
    # two-sentence job description, drop irrelevant details (salary, location, ...),
    # and reply strictly in the format "岗位名称: xxx # 岗位描述: xxx # ".
    clean_prompt = '我要求你提取出这段文字中的岗位名称、岗位描述(用一句或者两句话概括),去除无关紧要的信息,比如工资,地点等等,并严格遵守"岗位名称: xxx # 岗位描述: xxx # "的格式进行回复'
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": clean_prompt},
            {"role": "user", "content": clean_prompt + input}
        ])
    return completion.choices[0].message['content']

def local_emb2pinecone(PINECONE_API_KEY, PINECONE_API_ENV, level, emb_path, text_path, delete=False):
    # Upload locally stored embeddings (JSON mapping id -> vector) together with
    # their source texts into the first Pinecone index, tagging each vector with
    # the original text and a "level" metadata field.
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    logger.info('Pinecone initialized')
    logger.info(pinecone.list_indexes()[0])
    l = load_json(emb_path)
    print(f'level{level} loaded')
    with open(text_path, 'r', encoding='utf-8') as f:
        texts = f.readlines()
    texts = [i.replace('\n', '') for i in texts]
    index = pinecone.Index(pinecone.list_indexes()[0])
    if delete:
        # Optionally wipe the index first, after an interactive confirmation.
        if input('press y to delete all the vectors: ') == 'y':
            index.delete(delete_all=True)
            logger.info('delete all')
    for key, value, text in zip(list(l.keys()), list(l.values()), texts):
        index.upsert([(key, value, {"text": text, "level": level})])
    logger.info('upload successfully')
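

# A minimal usage sketch, not part of the original module: it assumes the
# OPENAI_API_KEY environment variable is set, that a Pinecone index named
# "example-index" already exists, and that "example.pdf" is a stand-in file
# name. It strings together the PDF ingestion, Pinecone upload, and
# context-retrieval helpers defined above.
if __name__ == '__main__':
    openai.api_key = os.environ['OPENAI_API_KEY']  # assumed env var
    embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)

    # 1. Split a local PDF into chunks and push the chunks into Pinecone.
    chunks = load_local_file_PDF('example.pdf', 'example.pdf')  # hypothetical file
    data2embeddings('example-index', list(chunks.values()), embeddings)

    # 2. Retrieve supporting context for a user question.
    context, prompt, status = context_construction(
        api_key=openai.api_key,
        query='What is a migraine?',  # example query
        model='gpt-3.5-turbo',
        pinecone_api_key=PINECONE_API_KEY,
        pinecone_api_env=PINECONE_API_ENV,
        temperature=0,
        index_name='example-index',
    )
    logger.info(context)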