import hashlib
from io import StringIO

import pandas as pd
import streamlit as st
import weaviate
from weaviate import Client
from weaviate.embedded import EmbeddedOptions

# Key used by the original code when reading GraphQL ``Get`` responses.
# Kept only as a fallback so previously working callers keep working.
_LEGACY_RESULT_KEY = "Things"


def _extract_objects(results, class_name=None):
    """
    Pull the list of objects out of a raw GraphQL ``Get`` response.

    Lookup order inside ``results["data"]["Get"]``:
      1. ``class_name`` (when given and present),
      2. the legacy ``"Things"`` key,
      3. the only entry, when the section holds exactly one key.

    Returns an empty list when nothing matches or when any level of the
    response is missing or ``None``.
    """
    get_section = ((results or {}).get("data") or {}).get("Get") or {}
    if class_name is not None and class_name in get_section:
        return get_section[class_name] or []
    if _LEGACY_RESULT_KEY in get_section:
        return get_section[_LEGACY_RESULT_KEY] or []
    if len(get_section) == 1:
        return next(iter(get_section.values())) or []
    return []


def hybrid_search_weaviate(client, selected_class, query):
    """
    Search ``selected_class`` in Weaviate for objects matching ``query``.

    Despite the name, the request built here is a ``Like`` where-filter,
    not a hybrid (BM25 + vector) query.

    Returns the matching objects as a list of dictionaries (empty when the
    response carries no objects for the class).
    """
    # The filter is passed directly: ``with_where`` expects ``path`` and
    # ``operator`` at the top level of the dict, not nested under "where".
    # NOTE(review): the "*" property selector and the ["*"] filter path are
    # kept from the original; confirm the server accepts them, otherwise
    # replace them with explicit property names.
    where_filter = {
        "path": ["*"],
        "operator": "Like",
        "valueString": query,
    }

    results = client.query.get(selected_class, "*").with_where(where_filter).do()

    # Results are looked up under the queried class name first.
    return _extract_objects(results, selected_class)


def convert_to_tapas_format(data):
    """
    Convert Weaviate results into a table given as a list of lists.

    ``data`` may be either a list of dictionaries (as returned by
    ``hybrid_search_weaviate``) or a raw query response dictionary.

    The first row of the returned table holds the column names; every
    following row holds the values of one object.
    """
    if isinstance(data, dict):
        records = _extract_objects(data)
    else:
        records = list(data or [])

    # Objects that wrap their payload under a "thing" key (the shape the
    # original code required) are unwrapped; plain property dicts are used
    # as they are.
    rows = [
        obj["thing"] if isinstance(obj, dict) and "thing" in obj else obj
        for obj in records
    ]

    df = pd.DataFrame(rows)
    return [df.columns.tolist()] + df.values.tolist()


def initialize_weaviate_client():
    """
    Create and return a Weaviate client configured with embedded options.
    """
    return weaviate.Client(embedded_options=EmbeddedOptions())


def class_exists(client, class_name):
    """
    Return True when looking up ``class_name`` in the schema succeeds,
    False when the lookup raises.
    """
    # NOTE(review): confirm ``schema.get_class`` exists on the installed
    # client version; if it does not, the AttributeError is caught below and
    # this function always returns False.
    try:
        client.schema.get_class(class_name)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not swallowed.
        return False
    return True


def map_dtype_to_weaviate(dtype):
    """
    Map a pandas dtype (or its string form) to a Weaviate data type name.

    Returns "int", "number" or "boolean" for integer, float and boolean
    dtypes, and "string" for everything else.
    """
    # Lower-cased so pandas nullable dtypes ("Int64", "Float64") match too.
    dtype_name = str(dtype).lower()
    if "int" in dtype_name:
        return "int"
    if "float" in dtype_name:
        return "number"
    if "bool" in dtype_name:
        return "boolean"
    return "string"


def create_new_class_schema(client, class_name, class_description):
    """
    Create a new class with no properties in the Weaviate schema.

    The outcome is reported in the Streamlit UI: a success message when the
    class is created, an error message when creation raises.
    """
    class_schema = {
        "class": class_name,
        "description": class_description,
        "properties": [],
    }
    try:
        client.schema.create({"classes": [class_schema]})
        st.success(f"Class {class_name} created successfully!")
    except Exception as e:
        st.error(f"Error creating class: {e}")


def ingest_data_to_weaviate(client, csv_file, selected_class):
    """
    Ingest an uploaded CSV file into ``selected_class``.

    Steps, in order:
      1. Read the CSV into a DataFrame.
      2. Store one table-level object (embedding + full CSV content).
      3. If the class has no properties yet, create one per CSV column;
         otherwise require the CSV columns to equal the schema columns.
      4. Store every CSV row as its own object.

    Returns the DataFrame on completion, or None when the CSV columns do
    not match the existing schema.
    """
    csv_text = csv_file.read().decode("utf-8")
    dataframe = pd.read_csv(StringIO(csv_text))

    # NOTE(review): ``tapas_utils`` is not imported anywhere in the visible
    # file; confirm where it comes from and add the import.
    embedded_table = tapas_utils.embed_table(dataframe)

    # Content-derived ID: the same CSV content always yields the same ID.
    csv_content = dataframe.to_csv(index=False)
    table_id = hashlib.md5(csv_content.encode()).hexdigest()

    # NOTE(review): this object is stored before the schema check below, so
    # a CSV that is later rejected still leaves it behind. Also confirm that
    # "id" is accepted as a property name (the client has a separate
    # ``uuid`` argument for object IDs).
    client.data_object.create(
        {
            "id": table_id,
            "embeddedTable": embedded_table.tolist(),
            "content": csv_content,
        },
        selected_class,
    )

    class_schema = get_class_schema(client, selected_class)
    # ``.get`` because a class may come back without a "properties" key.
    existing_properties = (class_schema or {}).get("properties") or []

    if not existing_properties:
        # Empty schema: derive one property per CSV column.
        for column_name, data_type in zip(dataframe.columns, dataframe.dtypes):
            property_schema = {
                "name": column_name,
                "description": f"Property for {column_name}",
                "dataType": [map_dtype_to_weaviate(data_type)],
            }
            try:
                client.schema.property.create(selected_class, property_schema)
            except weaviate.exceptions.SchemaValidationException:
                # Property might already exist, so we can continue.
                pass
    else:
        # Existing schema: the CSV must have exactly the same columns.
        schema_columns = [prop["name"] for prop in existing_properties]
        if set(dataframe.columns) != set(schema_columns):
            st.error(
                "The columns in the uploaded CSV do not match the schema of the selected class. "
                "Please check and upload the correct CSV or create a new class."
            )
            return

    # Row ingestion is best-effort: a failing record is reported and the
    # remaining records are still attempted.
    records = dataframe.to_dict(orient="records")
    failed = 0
    for record in records:
        try:
            client.data_object.create(record, selected_class)
        except Exception as e:
            failed += 1
            st.error(f"Error ingesting record: {e}")

    # Only claim success when every record was stored.
    if failed:
        st.warning(
            f"{failed} of {len(records)} records could not be ingested "
            f"into the class '{selected_class}'."
        )
    else:
        st.write(f"Your CSV was successfully integrated into the vector database under the class '{selected_class}'")

    # Preview of the first few rows.
    st.write(dataframe.head())

    return dataframe


def get_class_schema(client, class_name):
    """
    Return the schema dictionary of ``class_name``, or None when the class
    is not in the schema or fetching the schema raises a
    SchemaValidationException.
    """
    try:
        schema = client.schema.get()
    except weaviate.exceptions.SchemaValidationException:
        return None

    for cls in (schema or {}).get("classes") or []:
        if cls.get("class") == class_name:
            return cls
    return None


def retrieve_relevant_table(client, selected_class, question_embedding):
    """
    Fetch the most relevant stored table for a question and return it as a
    DataFrame.

    ``question_embedding`` may be a near-text dictionary (passed to
    ``with_near_text`` unchanged, as before) or an embedding vector (sent
    as a near-vector search).

    Raises IndexError when the query returns no objects.
    """
    query = client.query.get(selected_class, ["content"])
    if isinstance(question_embedding, dict):
        query = query.with_near_text(question_embedding)
    else:
        # ``with_near_text`` only accepts a dict, so a raw vector needs
        # ``with_near_vector`` instead.
        query = query.with_near_vector({"vector": question_embedding})
    results = query.do()

    matches = _extract_objects(results, selected_class)
    if not matches:
        # Same exception type the original ``[...][0]`` produced, but with
        # a message that says what went wrong.
        raise IndexError(
            f"No table found in class '{selected_class}' for the given question."
        )

    table_content = matches[0].get("content")
    return pd.read_csv(StringIO(table_content))