AI-ANK committed on
Commit
1ff4ca8
1 Parent(s): 7fa5d95

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. NY-House-Dataset.csv +0 -0
  3. NY_House_Dataset.db +3 -0
  4. app.py +254 -0
  5. requirements.txt +8 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ NY_House_Dataset.db filter=lfs diff=lfs merge=lfs -text
NY-House-Dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
NY_House_Dataset.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d609196693aaabd6751e61a1cc8a09f375ca41cea51eb019097803638093e3a0
3
+ size 1331200
app.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Streamlit application for New York Housing Market Explorer
2
+
3
+ # Required imports
4
+ import streamlit as st
5
+ import pandas as pd
6
+ from llama_index import SimpleDirectoryReader, ServiceContext, StorageContext, VectorStoreIndex
7
+ from llama_index.llms import OpenAI
8
+ from llama_index.embeddings import FastEmbedEmbedding
9
+ from qdrant_client import QdrantClient
10
+ import json
11
+ import os
12
+ from sqlalchemy import create_engine
13
+ from llama_index import SQLDatabase, ServiceContext
14
+ from llama_index.indices.struct_store import NLSQLTableQueryEngine
15
+ from pathlib import Path
16
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
17
+ from llama_index.query_engine import (
18
+ SQLAutoVectorQueryEngine,
19
+ RetrieverQueryEngine,
20
+ )
21
+ from llama_index.tools.query_engine import QueryEngineTool
22
+ from llama_index.indices.vector_store import VectorIndexAutoRetriever
23
+
24
+ from llama_index.indices.vector_store.retrievers import (
25
+ VectorIndexAutoRetriever,
26
+ )
27
+ from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo
28
+ from llama_index.query_engine.retriever_query_engine import (
29
+ RetrieverQueryEngine,
30
+ )
31
+
# Streamlit page setup: wide layout so the dataframe and results use the full width.
st.set_page_config(layout="wide")
# Directory holding one text file per property listing (written by create_text_and_embeddings).
write_dir = Path("textdata")

# Initialize Qdrant client
# NOTE(review): raises KeyError at startup if QDRANT_URL / QDRANT_API_KEY are unset.
client = QdrantClient(
    url=os.environ['QDRANT_URL'],
    api_key=os.environ['QDRANT_API_KEY'],
)

# Initialize LLM and embedding model
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
service_context = ServiceContext.from_defaults(chunk_size_limit=1024, llm=llm, embed_model=embed_model)

# Vector store backed by the pre-populated "housing2" Qdrant collection.
vector_store = QdrantVectorStore(client=client, collection_name="housing2")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

#Create vector indexes and store in Qdrant. To be run only once in the beginning
#from llama_index import VectorStoreIndex
#index = VectorStoreIndex.from_documents(documents, vector_store=vector_store, service_context=service_context, storage_context=storage_context)

# Load the vector index from Qdrant collection
# (the collection is assumed to have been populated by the one-time step above).
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context
)
+ # Function to extract and format text data from a dataframe row
60
+ def get_text_data(data):
61
+ return f"""
62
+ BROKERTITLE: {data['BROKERTITLE']}
63
+ TYPE: {data['TYPE']}
64
+ PRICE: {data['PRICE']}
65
+ BEDS: {data['BEDS']}
66
+ BATH: {data['BATH']}
67
+ PROPERTYSQFT: {data['PROPERTYSQFT']}
68
+ ADDRESS: {data['ADDRESS']}
69
+ STATE: {data['STATE']}
70
+ MAIN_ADDRESS: {data['MAIN_ADDRESS']}
71
+ ADMINISTRATIVE_AREA_LEVEL_2: {data['ADMINISTRATIVE_AREA_LEVEL_2']}
72
+ LOCALITY: {data['LOCALITY']}
73
+ SUBLOCALITY: {data['SUBLOCALITY']}
74
+ STREET_NAME: {data['STREET_NAME']}
75
+ LONG_NAME: {data['LONG_NAME']}
76
+ FORMATTED_ADDRESS: {data['FORMATTED_ADDRESS']}
77
+ LATITUDE: {data['LATITUDE']}
78
+ LONGITUDE: {data['LONGITUDE']}
79
+ """
def create_text_and_embeddings(data=None, target_dir=None):
    """Write one ``Property_<idx>.txt`` file per dataframe row.

    Each file contains the row's "text" column (built by get_text_data).
    Any files already present in the target directory are deleted first so
    reruns do not leave stale listings behind.

    Args:
        data: DataFrame with a "text" column. Defaults to the module-level
            ``df`` (backward compatible with the original no-arg call).
        target_dir: Output directory as a pathlib.Path. Defaults to the
            module-level ``write_dir`` ("textdata").
    """
    if data is None:
        data = df
    if target_dir is None:
        target_dir = write_dir

    if target_dir.exists():
        print(f"Directory exists: {target_dir}")
        # Plain loop instead of a side-effect list comprehension.
        # NOTE(review): unlink() fails on subdirectories — assumes the
        # directory contains only the flat Property_*.txt files.
        for existing in target_dir.iterdir():
            existing.unlink()
    else:
        print(f"Creating directory: {target_dir}")
        target_dir.mkdir(exist_ok=True, parents=True)

    # Loop variable renamed from `index` so it no longer shadows the
    # module-level vector index.
    for idx, row in data.iterrows():
        if "text" in row:
            file_path = target_dir / f"Property_{idx}.txt"
            with file_path.open("w") as f:
                f.write(str(row["text"]))
        else:
            print(f"No 'text' column found at index {idx}")

    print(f"Files created in {target_dir}")
#create_text_and_embeddings() #execute only once in the beginning
99
+
@st.cache_data
def load_data():
    """Load the per-property text files from ``textdata`` as llama_index documents.

    Returns:
        The list of documents produced by SimpleDirectoryReader, or None when
        the directory does not exist (e.g. create_text_and_embeddings has
        never been run). Cached by Streamlit so the files are read once per
        session.
    """
    if write_dir.exists():
        reader = SimpleDirectoryReader(input_dir="textdata")
        return reader.load_data()
    # Explicit None instead of falling off the end, so the "no data" case
    # is visible to callers rather than an accidental implicit return.
    return None
documents = load_data()  # may be None if the textdata directory is absent

# Streamlit UI setup
st.title('New York Housing Market Explorer')

# Load the dataset
df_file_path = 'NY-House-Dataset.csv' # Path to the csv file
if os.path.exists(df_file_path):
    df = pd.read_csv(df_file_path)
    # One formatted "KEY: value" text blob per row (see get_text_data);
    # dropped again before the text-to-SQL path below.
    df["text"] = df.apply(get_text_data, axis=1)
    st.dataframe(df) # Display df in the UI
else:
    # NOTE(review): the app continues after this error, so `df` is undefined
    # downstream when the CSV is missing — confirm whether st.stop() is wanted.
    st.error("Data file not found. Please check the path and ensure it's correct.")

# Input from user
user_query = st.text_input("Enter your query:", "Suggest 3 houses in Manhattan brokered by compass.")

# Define the options for the radio button
options = ['Simple: Qdrant Similarity Search + LLM Call (works well for filtering type of queries)', 'Advanced: Qdrant Similarity Search + Llamaindex Text-to-SQL']

# Create a radio button for the options
selection = st.radio("Choose an option:", options)
# Processing the query
if st.button("Submit Query"):
    # Execute different blocks of code based on the selection
    if selection == 'Simple: Qdrant Similarity Search + LLM Call (works well for filtering type of queries)':
        # Part 1, semantic search + LLM call
        # Generate query vector
        query_vector = embed_model.get_query_embedding(user_query)
        # Perform search with Qdrant
        response = client.search(collection_name="housing2", query_vector=query_vector, limit=10)
        # Processing and displaying the results
        text = ''
        properties_list = []  # List to store multiple property dictionaries
        for scored_point in response:
            # Access the payload, then parse the '_node_content' JSON string to get the 'text'
            node_content = json.loads(scored_point.payload['_node_content'])
            text += f"\n{node_content['text']}\n"
            # Initialize a new dictionary for the current property
            property_dict = {}
            # NOTE(review): assumes every non-empty line is "KEY: value";
            # a line without the ': ' separator would raise ValueError here.
            for line in node_content['text'].split('\n'):
                if line.strip():  # Ensure line is not empty
                    key, value = line.split(': ', 1)
                    property_dict[key.strip()] = value.strip()
            # Add the current property dictionary to the list
            properties_list.append(property_dict)

        # properties_list contains all the retrieved property dictionaries
        with st.status("Retrieving points/nodes based on user query", expanded = True) as status:
            for property_dict in properties_list:
                st.json(json.dumps(property_dict, indent=4))
                print(property_dict)
            status.update(label="Retrieved points/nodes based on user query", state="complete", expanded=False)

        with st.status("Simple Method: Generating response based on Similarity Search + LLM Call", expanded = True) as status:
            # Prompt asks for a prose answer plus a ```-fenced JSON list of
            # coordinates; the doubled braces escape literal JSON braces in
            # the f-string.
            prompt_template = f"""
Using the below context information respond to the user query.
context: '{properties_list}'
query: '{user_query}'
Response structure should look like this:
*Detailed Response*

*Relevant Details in Table Format*

Also, generate the latitude and longitude for all the properties included in the response in JSON object format. For example, if there are properties at 40.761255, -73.974483 and 40.7844489, -73.9807532, the JSON object should look like this limited with 3 backticks. JUST OUTPUT THE JSON, NO NEED TO INCLUDE ANY TITLE OR TEXT BEFORE IT:

```[
{{
"latitude": 40.761255,
"longitude": -73.974483
}},
{{
"latitude": 40.7844489,
"longitude": -73.9807532
}}
]```

"""
            llm_response = llm.complete(prompt_template)
            # Only the prose before the first ``` fence is rendered; the
            # fenced JSON coordinates are parsed out and not displayed here.
            response_parts = llm_response.text.split('```')
            st.markdown(response_parts[0])

    elif selection == 'Advanced: Qdrant Similarity Search + Llamaindex Text-to-SQL':
        #Part 2, Semantic Search + Text-to-SQL
        with st.status("Advanced Method: Generating response based on Qdrant Similarity Search + Llamaindex Text-to-SQL", expanded = True):
            # Drop the synthetic "text" column so only the raw columns land in SQL.
            df2 = df.drop('text', axis=1)
            #Create a SQLite database and engine
            # NOTE(review): URL requests read-only mode (?mode=ro) yet to_sql
            # below writes the table — confirm the intended access mode.
            engine = create_engine("sqlite:///NY_House_Dataset.db?mode=ro", connect_args={"uri": True})
            sql_database = SQLDatabase(engine)
            #Convert the DataFrame to a SQL table within the SQLite database
            df2.to_sql('housing_data_sql', con=engine, if_exists='replace', index=False)

            #Build sql query engine
            sql_query_engine = NLSQLTableQueryEngine(
                sql_database=sql_database
            )

            # Schema description used by the auto-retriever to build
            # metadata filters for the vector store.
            vector_store_info = VectorStoreInfo(
                content_info="Housing data details for NY",
                metadata_info = [
                    MetadataInfo(name="BROKERTITLE", type="str", description="Title of the broker"),
                    MetadataInfo(name="TYPE", type="str", description="Type of the house"),
                    MetadataInfo(name="PRICE", type="float", description="Price of the house"),
                    MetadataInfo(name="BEDS", type="int", description="Number of bedrooms"),
                    MetadataInfo(name="BATH", type="float", description="Number of bathrooms"),
                    MetadataInfo(name="PROPERTYSQFT", type="float", description="Square footage of the property"),
                    MetadataInfo(name="ADDRESS", type="str", description="Full address of the house"),
                    MetadataInfo(name="STATE", type="str", description="State of the house"),
                    MetadataInfo(name="MAIN_ADDRESS", type="str", description="Main address information"),
                    MetadataInfo(name="ADMINISTRATIVE_AREA_LEVEL_2", type="str", description="Administrative area level 2 information"),
                    MetadataInfo(name="LOCALITY", type="str", description="Locality information"),
                    MetadataInfo(name="SUBLOCALITY", type="str", description="Sublocality information"),
                    MetadataInfo(name="STREET_NAME", type="str", description="Street name"),
                    MetadataInfo(name="LONG_NAME", type="str", description="Long name of the house"),
                    MetadataInfo(name="FORMATTED_ADDRESS", type="str", description="Formatted address"),
                    MetadataInfo(name="LATITUDE", type="float", description="Latitude coordinate of the house"),
                    MetadataInfo(name="LONGITUDE", type="float", description="Longitude coordinate of the house"),
                ],
            )
            vector_auto_retriever = VectorIndexAutoRetriever(
                index, vector_store_info=vector_store_info
            )

            retriever_query_engine = RetrieverQueryEngine.from_args(
                vector_auto_retriever, service_context=service_context
            )

            # NOTE(review): description says table 'houses' but the table
            # created above is 'housing_data_sql' — confirm which is correct.
            sql_tool = QueryEngineTool.from_defaults(
                query_engine=sql_query_engine,
                description=(
                    "Useful for translating a natural language query into a SQL query over"
                    " a table 'houses', containing prices of New York houses, providing valuable insights into the real estate market in the region. It includes information such as broker titles, house types, prices, number of bedrooms and bathrooms, property square footage, addresses, state, administrative and local areas, street names, and geographical coordinates."

                ),
            )
            vector_tool = QueryEngineTool.from_defaults(
                query_engine=retriever_query_engine,
                description=(
                    f"Useful for answering questions about different housing listings in New York. Use this to refine your answers"
                ),
            )

            # Router engine: picks SQL and/or vector retrieval per query.
            query_engine = SQLAutoVectorQueryEngine(
                sql_tool, vector_tool, service_context=service_context
            )
            response = query_engine.query(f"{user_query}+. Provide a detailed response and include lONG_NAME, name of broker, number of beds, number of baths, propertysqft and FORMATTED_ADDRESS. ALWAYS USE LIKE in WHERE CLAUSE. ALWAYS RESPOND IN WELL FORMATTED MARKDOWN")
            st.markdown(response.response)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit-pills
2
+ streamlit
3
+ SQLAlchemy
4
+ openai
5
+ llama-index
6
+ google-generativeai
7
+ transformers
8
+ torch