Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- views/define_query.py +66 -0
- views/extract_information.py +100 -0
- views/home.py +54 -0
- views/upload_data.py +54 -0
- views/view_and_download.py +81 -0
views/define_query.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def CreatePage():
    """Render the 'Define Your Custom Query' page.

    Lets the user pick the entity column, name the fields to extract, and
    write a query template containing the ``{entity}`` placeholder. On save,
    the configuration is stored in ``st.session_state`` under
    ``column_selection``, ``query_template`` and ``extraction_fields``.
    """
    st.header("Define Your Custom Query")

    if "data" not in st.session_state or st.session_state["data"] is None:
        st.warning("Please upload data first! Use the 'Upload Data' section to upload your data.")
    else:
        column = st.selectbox(
            "Select entity column",
            st.session_state["data"].columns,
            help="Select the column that contains the entities for which you want to define queries."
        )

        # BUG FIX: this is a plain (non-f) string, so the doubled braces used
        # previously were emitted literally into the page, producing invalid
        # CSS. Single braces are correct here.
        st.markdown("""
            <style>
            div[data-baseweb="select"] div[data-id="select"] {
                background-color: #f0f8ff;
            }
            </style>
        """, unsafe_allow_html=True)

        st.subheader("Define Fields to Extract")
        num_fields = st.number_input(
            "Number of fields to extract",
            min_value=1,
            value=1,
            step=1,
            help="Specify how many fields you want to extract from each entity."
        )

        # Collect only non-empty field names; empty inputs are skipped.
        fields = []
        for i in range(num_fields):
            field = st.text_input(
                f"Field {i+1} name",
                key=f"field_{i}",
                placeholder=f"Enter field name for {i+1}",
                help="Name the field you want to extract from the entity."
            )
            if field:
                fields.append(field)

        if fields:
            st.subheader("Query Template")
            query_template = st.text_area(
                "Enter query template (Use '{entity}' to represent each entity)",
                value=f"Find the {', '.join(fields)} for {{entity}}",
                help="You can use {entity} as a placeholder to represent each entity in the query."
            )

            # Live preview of the template applied to the first entity value.
            if "{entity}" in query_template:
                example_entity = str(st.session_state["data"][column].iloc[0])
                example_query = query_template.replace("{entity}", example_entity)
                st.write("### Example Query Preview")
                st.code(example_query)

            if st.button("Save Query Configuration"):
                if not fields:
                    st.error("Please define at least one field to extract.")
                elif not query_template:
                    st.error("Please enter a query template.")
                else:
                    # These three keys are consumed together by the
                    # extraction page.
                    st.session_state["column_selection"] = column
                    st.session_state["query_template"] = query_template
                    st.session_state["extraction_fields"] = fields
                    st.success("Query configuration saved successfully!")
|
views/extract_information.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from funcs.llm import LLM
|
3 |
+
class ExtractInformation:
    """Streamlit page that runs LLM-backed information extraction for every
    entity in the user-selected column and renders searchable results."""

    def __init__(self, llm: LLM):
        # Search-augmented LLM used to answer one query per entity.
        self.llm = llm

    @staticmethod
    def _matches(result, search_query: str) -> bool:
        """Return True when the case-insensitive query hits the entity name
        or the extracted information of a single result row."""
        needle = search_query.lower()
        return (needle in str(result["Entity"]).lower()
                or needle in str(result["Extracted Information"]).lower())

    def CreatePage(self):
        """Render the extraction page: show the saved query template, run the
        extraction loop with progress feedback, then offer compact/detailed
        searchable views of the results."""
        st.header("Extract Information")
        # BUG FIX: "column_selection" is read below but was not part of the
        # guard; a partially cleared session would raise a KeyError.
        if ("query_template" in st.session_state
                and "data" in st.session_state
                and "column_selection" in st.session_state):
            st.write("### Using Query Template:")
            st.code(st.session_state["query_template"])

            column_selection = st.session_state["column_selection"]
            entities_column = st.session_state["data"][column_selection]

            col1, col2 = st.columns([2, 1])
            with col1:
                st.write("### Selected Entity Column:")
                st.dataframe(entities_column, use_container_width=True)

            with col2:
                start_button = st.button("Start Extraction", type="primary", use_container_width=True)

            results_container = st.empty()

            if start_button:
                with st.spinner("Extracting information..."):
                    progress_bar = st.progress(0)
                    progress_text = st.empty()

                    try:
                        results = []
                        for i, selected_entity in enumerate(entities_column):
                            # Substitute the entity into the saved template.
                            user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
                            final_answer, search_results = self.llm.refine_answer_with_searches(selected_entity, user_query)
                            results.append({
                                "Entity": selected_entity,
                                "Extracted Information": final_answer,
                                "Search Results": search_results
                            })

                            progress = (i + 1) / len(entities_column)
                            progress_bar.progress(progress)
                            progress_text.text(f"Processing {i+1}/{len(entities_column)} entities...")

                        st.session_state["results"] = results

                        progress_bar.empty()
                        progress_text.empty()
                        st.success("Extraction completed successfully!")

                    except Exception as e:
                        st.error(f"An error occurred during extraction: {str(e)}")
                        # Drop stale results so the views below stay consistent.
                        st.session_state.pop("results", None)

            if "results" in st.session_state and st.session_state["results"]:
                with results_container:
                    results = st.session_state["results"]

                    search_query = st.text_input("🔍 Search results", "")

                    tab1, tab2 = st.tabs(["Compact View", "Detailed View"])

                    with tab1:
                        found_results = False
                        for result in results:
                            if self._matches(result, search_query):
                                found_results = True
                                with st.expander(f"📋 {result['Entity']}", expanded=False):
                                    st.markdown("#### Extracted Information")
                                    st.write(result["Extracted Information"])

                        if not found_results and search_query:
                            st.info("No results found for your search.")

                    with tab2:
                        found_results = False
                        for i, result in enumerate(results):
                            if self._matches(result, search_query):
                                found_results = True
                                st.markdown(f"### Entity {i+1}: {result['Entity']}")

                                col1, col2 = st.columns(2)

                                with col1:
                                    st.markdown("#### 📝 Extracted Information")
                                    st.info(result["Extracted Information"])

                                with col2:
                                    st.markdown("#### 🔍 Search Results")
                                    st.warning(result["Search Results"])

                                st.divider()

                        if not found_results and search_query:
                            st.info("No results found for your search.")
        else:
            st.warning("Please upload your data and define the query template.")
|
views/home.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
def CreatePage():
    """Render the DataScribe landing page: a hero banner followed by four
    navigation cards laid out in two rows of two columns."""
    st.markdown("""
    <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
    <p style="text-align:center; font-size: 18px; color:#333;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
    """, unsafe_allow_html=True)

    st.markdown("""---""")

    def render_card(card_title, card_description, card_icon, target_page):
        # One clickable feature card: icon on the left, button + caption on
        # the right; clicking stores the navigation target in session state.
        icon_col, body_col = st.columns([1, 4])
        with icon_col:
            st.markdown(f"<div style='font-size: 40px; text-align:center;'>{card_icon}</div>", unsafe_allow_html=True)
        with body_col:
            if st.button(f"{card_title}", key=card_title, help=card_description):
                st.session_state.selected_page = target_page
            st.markdown(f"<p style='font-size: 14px; color:#555;'>{card_description}</p>", unsafe_allow_html=True)

    cards = [
        ("Upload Data",
         "Upload data from CSV or Google Sheets to get started with your extraction.",
         "📄", "Upload Data"),
        ("Define Custom Queries",
         "Set custom search queries for each entity in your dataset for specific information retrieval.",
         "🔍", "Define Query"),
        ("Run Automated Searches",
         "Execute automated web searches and extract relevant information using an AI-powered agent.",
         "🤖", "Extract Information"),
        ("View & Download Results",
         "View extracted data in a structured format and download as a CSV or update Google Sheets.",
         "📊", "View & Download"),
    ]

    # Two rows of two cards each, matching the original layout order.
    for row_start in (0, 2):
        left_col, right_col = st.columns([1, 1])
        for card_col, card in zip((left_col, right_col), cards[row_start:row_start + 2]):
            with card_col:
                render_card(*card)

    return True
|
views/upload_data.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from funcs.googlesheet import get_google_sheet_data
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
def CreatePage():
    """Render the data-ingestion page.

    Supports two sources: concatenating one or more uploaded CSV files, or
    fetching a range from Google Sheets. The resulting DataFrame is stored
    in ``st.session_state["data"]``.
    """
    st.header("Upload or Connect Your Data")
    data_source = st.radio("Choose data source:", ["CSV Files", "Google Sheets"])

    if data_source == "CSV Files":
        if "data" in st.session_state:
            st.success("Data uploaded successfully! Here is a preview:")
            st.dataframe(st.session_state["data"].head(10))  # Display only the first 10 rows for a cleaner view
        else:
            uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)

            # BUG FIX: with accept_multiple_files=True the uploader returns
            # an empty list (not None) before any upload, so the previous
            # `is not None` check showed "No valid data found" prematurely.
            if uploaded_files:
                dfs = []
                for uploaded_file in uploaded_files:
                    try:
                        df = pd.read_csv(uploaded_file)
                        dfs.append(df)
                    except Exception as e:
                        st.error(f"Error reading file {uploaded_file.name}: {e}")

                if dfs:
                    # Stack all uploaded CSVs into a single DataFrame.
                    full_data = pd.concat(dfs, ignore_index=True)
                    st.session_state["data"] = full_data
                    st.success("Data uploaded successfully! Here is a preview:")
                    st.dataframe(full_data.head(10))  # Show preview of first 10 rows
                else:
                    st.warning("No valid data found in the uploaded files.")

        if st.button("Clear Data"):
            # BUG FIX: pop() is safe even when no data has been stored yet;
            # `del` raised KeyError in that case.
            st.session_state.pop("data", None)
            st.success("Data has been cleared!")

    elif data_source == "Google Sheets":
        sheet_id = st.text_input("Enter Google Sheet ID")
        range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")

        if sheet_id and range_name:
            if st.button("Fetch Data"):
                with st.spinner("Fetching data from Google Sheets..."):
                    try:
                        data = get_google_sheet_data(sheet_id, range_name)
                        st.session_state["data"] = data
                        st.success("Data fetched successfully! Here is a preview:")
                        st.dataframe(data.head(10))  # Show preview of first 10 rows
                    except Exception as e:
                        st.error(f"Error fetching data: {e}")
        else:
            st.warning("Please enter both Sheet ID and Range name before fetching data.")
|
views/view_and_download.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import gspread
|
3 |
+
from google.oauth2.service_account import Credentials
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
def CreatePage():
    """Render the results page.

    Previews the extracted results, offers CSV downloads (all results, or
    just the extracted / web-search columns), and can optionally write the
    results back to a Google Sheet range.
    """
    import os  # local import: only needed to resolve the credentials path

    st.header("View & Download Results")

    if "results" in st.session_state and st.session_state["results"]:
        results_df = pd.DataFrame(st.session_state["results"])
        st.write("### Results Preview")

        # Display the results preview, highlighting the text columns.
        if "Extracted Information" in results_df.columns and "Search Results" in results_df.columns:
            st.dataframe(results_df.style.map(lambda val: 'background-color: #d3f4ff' if isinstance(val, str) else '', subset=["Extracted Information", "Search Results"]))
        else:
            st.warning("Required columns are missing in results data.")

        # Download options
        download_option = st.selectbox(
            "Select data to download:",
            ["All Results", "Extracted Information", "Web Results"]
        )

        if download_option == "All Results":
            data_to_download = results_df
        elif download_option == "Extracted Information":
            data_to_download = results_df[["Entity", "Extracted Information"]]
        elif download_option == "Web Results":
            data_to_download = results_df[["Entity", "Search Results"]]

        st.download_button(
            label=f"Download {download_option} as CSV",
            data=data_to_download.to_csv(index=False),
            file_name=f"{download_option.lower().replace(' ', '_')}.csv",
            mime="text/csv"
        )

        # Option to update Google Sheets
        update_option = st.selectbox(
            "Do you want to update Google Sheets?",
            ["No", "Yes"]
        )

        if update_option == "Yes":
            if 'sheet_id' not in st.session_state:
                st.session_state.sheet_id = ''
            if 'range_name' not in st.session_state:
                st.session_state.range_name = ''

            # Input fields for Google Sheets ID and Range
            sheet_id = st.text_input("Enter Google Sheet ID", value=st.session_state.sheet_id)
            range_name = st.text_input("Enter Range (e.g., 'Sheet1!A1')", value=st.session_state.range_name)

            if sheet_id and range_name:
                st.session_state.sheet_id = sheet_id
                st.session_state.range_name = range_name

                # Prepare data for update: header row followed by the records.
                data_to_update = [results_df.columns.tolist()] + results_df.values.tolist()

                # Update Google Sheets button
                if st.button("Update Google Sheet"):
                    try:
                        if '!' not in range_name:
                            st.error("Invalid range format. Please use the format 'SheetName!Range'.")
                        else:
                            sheet_name, cell_range = range_name.split('!', 1)
                            scopes = ["https://www.googleapis.com/auth/spreadsheets"]
                            # FIX: the credentials path was hard-coded to one
                            # developer's machine; allow overriding via env var
                            # and keep the original path as the fallback for
                            # backward compatibility.
                            creds_path = os.environ.get(
                                "DATASCRIBE_CREDENTIALS",
                                "/Users/sam22ridhi/Desktop/data/DataScribe/credentials/credentials.json"
                            )
                            creds = Credentials.from_service_account_file(creds_path, scopes=scopes)
                            client = gspread.authorize(creds)
                            sheet = client.open_by_key(sheet_id).worksheet(sheet_name)
                            sheet.clear()
                            sheet.update(f"{cell_range}", data_to_update)
                            st.success("Data updated in the Google Sheet!")
                    except Exception as e:
                        st.error(f"Error updating Google Sheet: {e}")
            else:
                st.warning("Please enter both the Sheet ID and Range name before updating.")
    else:
        st.warning("No results available to view. Please run the extraction process.")
|