thefish1 committed
Commit 6dcc10e · Parent(s): 6c98343
Files changed (2):
  1. app.py +13 -9
  2. vec_db.py +119 -0
app.py CHANGED
@@ -6,11 +6,11 @@ import re
 from load_data import load_data
 from openai import OpenAI
 from transformers import AutoTokenizer, AutoModel
-from fetch_from_database import encode, insert_keywords_to_weaviate, fetch_summary_from_database, init_database
+from vec_db import encode_list_to_avg, fetch_response_from_db
 import weaviate
 import os
 import subprocess
-
+import torch
 
 # Set the Matplotlib cache directory
 os.environ['MPLCONFIGDIR'] = '/tmp/matplotlib'
@@ -23,13 +23,18 @@ auth_config = weaviate.AuthApiKey(api_key="8wNsHV3Enc2PNVL8Bspadh21qYAfAvnK2ux3"
 
 
 
-database_client = weaviate.Client(
-    url="https://3a8sbx3s66by10yxginaa.c0.asia-southeast1.gcp.weaviate.cloud",
-    auth_client_secret=auth_config
-)
+URL = "https://39nlafviqvard82k6y8btq.c0.asia-southeast1.gcp.weaviate.cloud"
+APIKEY = "Y7c8DRmcxZ4nP5IJLwkznIsK84l6EdwfXwcH"
+
+# Connect to a WCS instance
+client = weaviate.connect_to_wcs(
+    cluster_url=URL,
+    auth_credentials=weaviate.auth.AuthApiKey(APIKEY))
+
 
-class_name = "Lhnjames123321"
+class_name = "ad_database02"
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
 model = AutoModel.from_pretrained("bert-base-chinese")
 
@@ -194,9 +199,8 @@ def respond(
 
     query_keywords = list(keywords_dict.keys())
    # max_matches is used here as the distance value
-    class_name = "Lhnjames123321"
 
-    max_matches, top_keywords_list, top_summary = fetch_summary_from_database(query_keywords, class_name)
+    max_matches, top_keywords_list, top_summary = fetch_response_from_db(query_keywords, class_name)
 
 
     print(f"max_matches: {max_matches}")
vec_db.py ADDED
@@ -0,0 +1,119 @@
+import weaviate
+import pandas as pd
+import torch
+import json
+from transformers import AutoTokenizer, AutoModel
+import subprocess
+import os
+
+# Point the Matplotlib cache at a writable directory
+os.environ['MPLCONFIGDIR'] = '/tmp/matplotlib'
+# Point the Hugging Face Transformers cache at a writable directory
+os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
+
+# try:
+#     # Command to run the Weaviate Docker container
+#     command = [
+#         "docker", "run",
+#         "-p", "8080:8080",
+#         "-p", "50051:50051",
+#         "cr.weaviate.io/semitechnologies/weaviate:1.24.20"
+#     ]
+#
+#     # Execute the command
+#     subprocess.run(command, check=True)
+#     print("Docker container is running.")
+#
+# except subprocess.CalledProcessError as e:
+#     print(f"An error occurred: {e}")
+
+class_name = 'Lhnjames123321'
+auth_config = weaviate.AuthApiKey(api_key="8wNsHV3Enc2PNVL8Bspadh21qYAfAvnK2ux3")
+client = weaviate.Client(
+    url="https://3a8sbx3s66by10yxginaa.c0.asia-southeast1.gcp.weaviate.cloud",
+    auth_client_secret=auth_config
+)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model = AutoModel.from_pretrained("bert-base-chinese").to(device)
+tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
+
+
+def encode_sentences(sentences, model, tokenizer, device):
+    # Tokenize the batch, run BERT, and mean-pool the last hidden state
+    # into one fixed-size vector per sentence.
+    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
+    inputs = inputs.to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    embeddings = outputs.last_hidden_state.mean(dim=1)
+    return embeddings.cpu().numpy()
+
+
+# def class_exists(client, class_name):
+#     existing_classes = client.schema.get_classes()
+#     return any(cls['class'] == class_name for cls in existing_classes)
+
+
+def init_weaviate():
+    # if not class_exists(client, class_name):
+    #     class_obj = {
+    #         'class': class_name,
+    #         'vectorIndexConfig': {
+    #             'distance': 'cosine'
+    #         },
+    #     }
+    #     client.schema.create_class(class_obj)
+
+    file_path = 'data.json'
+    sentence_data = []
+
+    # data.json is JSON Lines: one object per line with a 'response' field
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            try:
+                data = json.loads(line.strip())
+                sentence1 = data.get('response', '')
+                sentence_data.append(sentence1)
+            except json.JSONDecodeError as e:
+                print(f"Error parsing JSON: {e}")
+                continue
+
+    sentence_embeddings = encode_sentences(sentence_data, model, tokenizer, device)
+
+    data = {'sentence': sentence_data,
+            'embeddings': sentence_embeddings.tolist()}
+    df = pd.DataFrame(data)
+
+    # Import each sentence with its precomputed vector
+    with client.batch(batch_size=100) as batch:
+        for i in range(df.shape[0]):
+            print(f'importing data: {i + 1}/{df.shape[0]}')
+            properties = {
+                'sentence_id': i + 1,
+                'sentence': df.sentence[i],
+            }
+            custom_vector = df.embeddings[i]
+            batch.add_data_object(
+                properties,
+                class_name=class_name,
+                vector=custom_vector
+            )
+    print('import completed')
+
+
+def use_weaviate(input_str):
+    # Embed the query and retrieve the five nearest sentences by vector distance
+    query = encode_sentences([input_str], model, tokenizer, device)[0].tolist()
+    nearVector = {
+        'vector': query
+    }
+
+    response = (
+        client.query
+        .get(class_name, ['sentence_id', 'sentence'])
+        .with_near_vector(nearVector)
+        .with_limit(5)
+        .with_additional(['distance'])
+        .do()
+    )
+    print(response)
+    results = response['data']['Get'][class_name]
+    text_list = [result['sentence'] for result in results]
+    return text_list
+
+
+if __name__ == '__main__':
+    init_weaviate()
+    input_str = input("Enter the query text: ")
+    ans = use_weaviate(input_str)
+    print("Query results:", ans)