cta2106 committed on
Commit
6431a8f
1 Parent(s): a532d6c

first commit

Files changed (7)
  1. .DS_Store +0 -0
  2. .idea/.gitignore +8 -0
  3. api.py +42 -0
  4. appsearch.py +106 -0
  5. config.py +1 -0
  6. requirements.txt +7 -0
  7. utils.py +23 -0
.DS_Store ADDED
Binary file (6.15 kB).
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
api.py ADDED
@@ -0,0 +1,42 @@
+ from fastapi import FastAPI
+ from utils import get_us_speeches
+ from config import UPDATE_SPEECHES
+
+ from haystack.document_stores import ElasticsearchDocumentStore
+ from haystack.nodes import ElasticsearchRetriever
+ from haystack.nodes import FARMReader
+ from haystack.pipelines import ExtractiveQAPipeline
+
+ import gradio as gr
+
+
+ # Document store backed by the hosted Elasticsearch cluster.
+ document_store = ElasticsearchDocumentStore(
+     host='fgm-v2.es.eastus2.azure.elastic-cloud.com',
+     username='elastic',
+     password='cxjWqZfmhcfhzpWmfX57ylJc',
+     scheme='https',
+     port=9243,
+     index='us-speeches'
+ )
+
+ if UPDATE_SPEECHES:
+     us_speeches = get_us_speeches()
+     document_store.write_documents(us_speeches)
+
+ retriever = ElasticsearchRetriever(
+     document_store=document_store
+ )
+
+ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
+
+ pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
+
+ app = FastAPI()
+
+
+ async def run_query(query: str):
+     return pipeline.run(query=query)
+
+
+ gr.Interface(fn=run_query, inputs="textbox", outputs="json").launch()
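For reference, a minimal sketch of querying the pipeline directly, assuming the standard Haystack v1 `pipeline.run` signature; the query string and top_k values below are illustrative, not part of the commit:

# Hypothetical direct query; "Retriever" and "Reader" are the default
# node names that ExtractiveQAPipeline assigns in Haystack v1.
result = pipeline.run(
    query="Who spoke about nuclear disarmament?",
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}},
)
for answer in result["answers"]:
    print(answer.answer, answer.score)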
appsearch.py ADDED
@@ -0,0 +1,106 @@
+ import json
+ from typing import Dict, Iterator, List, Union
+ from urllib.parse import urljoin
+ import requests
+
+
+ class AppSearchClient:
+     def __init__(self):
+         self.appsearch_endpoint = "https://fgm-v2.ent.eastus2.azure.elastic-cloud.com"
+         self.appsearch_private_key = "private-dzf1pbcssw97hxkm3wxbdrpu"
+         self.headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {self.appsearch_private_key}",
+         }
+         assert self.appsearch_endpoint is not None
+         assert self.appsearch_private_key is not None
+
+     def list_all_engines(self) -> Iterator[str]:
+         ENGINES_URL = "/api/as/v1/engines/"
+         request_url = urljoin(self.appsearch_endpoint, ENGINES_URL)
+         MAX_DOCS_PER_PAGE = 10
+         current_page = 1
+         while True:
+             params = (
+                 ("page[size]", f"{MAX_DOCS_PER_PAGE}"),
+                 ("page[current]", f"{current_page}"),
+             )
+             r = requests.get(request_url, headers=self.headers, params=params).json()
+             for item in r["results"]:
+                 yield item["name"]
+             current_page += 1
+             if not r["results"]:
+                 break
+
+     def create_engine(self, name: str) -> requests.Response:
+         ENGINES_URL = "/api/as/v1/engines/"
+         request_url = urljoin(self.appsearch_endpoint, ENGINES_URL)
+         data = json.dumps({"name": name}, indent=4, sort_keys=True)
+         r = requests.post(request_url, headers=self.headers, data=data)
+         return r
+
+     def index_documents(self, data: Union[Dict, List[Dict]], engine_name: str) -> None:
+         INDEX_URL = f"/api/as/v1/engines/{engine_name}/documents"
+         request_url = urljoin(self.appsearch_endpoint, INDEX_URL)
+         requests.post(
+             request_url,
+             headers=self.headers,
+             data=json.dumps(data, indent=4, sort_keys=True),
+         )
+
+     def list_existing_docs(self, engine_name: str) -> List[Dict]:
+         LIST_URL = f"/api/as/v1/engines/{engine_name}/documents/list"
+         MAX_DOCS_PER_PAGE = 100
+         request_url = urljoin(self.appsearch_endpoint, LIST_URL)
+         current_page = 1
+         docs = list()
+         while True:
+             params = (
+                 ("page[size]", f"{MAX_DOCS_PER_PAGE}"),
+                 ("page[current]", f"{current_page}"),
+             )
+             page_content = json.loads(
+                 requests.get(request_url, headers=self.headers, params=params).text
+             )["results"]
+             docs.extend(page_content)
+             current_page += 1
+             if not page_content:
+                 break
+         return docs
+
+     def list_existing_manual_urls(self, engine_name: str) -> Iterator[str]:
+         for doc in self.list_existing_docs(engine_name):
+             if doc["is_manual"] == "true":
+                 yield doc["id"]
+
+     def list_existing_non_manual_urls(self, engine_name: str) -> Iterator[str]:
+         for doc in self.list_existing_docs(engine_name):
+             if doc["is_manual"] == "false":
+                 yield doc["id"]
+
+     def list_existing_urls(self, engine_name: str) -> Iterator[str]:
+         for doc in self.list_existing_docs(engine_name):
+             yield doc["id"]
+
+     def get_elastic_query(self, data: str, size: int) -> requests.Response:
+         return requests.post(
+             url=f"{self.appsearch_endpoint}/api/as/v0/engines/us-speeches-s/elasticsearch/_search?size={size}",
+             headers=self.headers, data=data)
+
+     def delete_existing_non_manual_docs(self, engine_name: str) -> None:
+         non_manual_doc_ids = list(self.list_existing_non_manual_urls(engine_name))
+         DELETE_URL = f"/api/as/v1/engines/{engine_name}/documents"
+         MAX_DOCS_TO_DELETE_PER_REQUEST = 100
+         request_url = urljoin(self.appsearch_endpoint, DELETE_URL)
+
+         def chunker(seq, size):
+             return (seq[pos: pos + size] for pos in range(0, len(seq), size))
+
+         for group in chunker(
+             non_manual_doc_ids, MAX_DOCS_TO_DELETE_PER_REQUEST
+         ):
+             requests.delete(
+                 request_url,
+                 headers=self.headers,
+                 data=json.dumps(group, indent=4, sort_keys=True),
+             )
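A hypothetical usage sketch of the client above; the engine name mirrors the one used elsewhere in the commit, while the document fields and id are invented for illustration:

from appsearch import AppSearchClient

client = AppSearchClient()

# Engines currently available in the App Search deployment.
print(list(client.list_all_engines()))

# Index one illustrative document; the "id" doubles as the URL key
# that the list_existing_*_urls helpers yield.
client.index_documents(
    [{"id": "speech-001", "text": "…", "is_manual": "false"}],
    engine_name="us-speeches",
)

# Page through everything stored in the engine.
for doc in client.list_existing_docs("us-speeches"):
    print(doc["id"])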
config.py ADDED
@@ -0,0 +1 @@
+ UPDATE_SPEECHES = False
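UPDATE_SPEECHES gates the write_documents call in api.py: setting it to True for a single run (re)loads the speeches from App Search into Elasticsearch, presumably so the index is not rewritten on every start.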
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ elastic-enterprise-search==8.4.0
+ farm-haystack
+ requests~=2.28.1
+ fastapi~=0.86.0
+ torch
+ torchvision
+ gradio
utils.py ADDED
@@ -0,0 +1,23 @@
+ from typing import List, Dict
+
+ from appsearch import AppSearchClient
+
+
+ def get_us_speeches() -> List[Dict]:
+     appsearch = AppSearchClient()
+
+     us_speeches = appsearch.list_existing_docs("us-speeches")
+
+     # Drop App Search metadata before converting to Haystack documents.
+     for speech in us_speeches:
+         if "_meta" in speech:
+             del speech["_meta"]
+
+     us_speeches_dict = [
+         {
+             'content': speech["text"],
+             'meta': {'filename': speech["filename"], 'speaker': speech["speaker"], 'date': speech["date"],
+                      'url': speech["url"]}
+         } for speech in us_speeches
+     ]
+
+     return us_speeches_dict
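For reference, each entry returned by get_us_speeches() has the dictionary shape that ElasticsearchDocumentStore.write_documents() in api.py accepts; the field values below are invented for illustration:

example_document = {
    "content": "Four score and seven years ago…",  # the speech text
    "meta": {
        "filename": "gettysburg_address.txt",
        "speaker": "Abraham Lincoln",
        "date": "1863-11-19",
        "url": "https://example.com/speeches/gettysburg",
    },
}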