Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- views/define_query.py +66 -0
- views/extract_information.py +100 -0
- views/home.py +54 -0
- views/upload_data.py +54 -0
- views/view_and_download.py +81 -0
views/define_query.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def CreatePage():
    """Render the 'Define Your Custom Query' page.

    Lets the user pick the entity column, name the fields to extract, and
    write a query template containing the ``{entity}`` placeholder. On save,
    the configuration is stored in ``st.session_state`` under
    ``column_selection``, ``query_template`` and ``extraction_fields``.
    """
    st.header("Define Your Custom Query")

    if "data" not in st.session_state or st.session_state["data"] is None:
        st.warning("Please upload data first! Use the 'Upload Data' section to upload your data.")
    else:
        column = st.selectbox(
            "Select entity column",
            st.session_state["data"].columns,
            help="Select the column that contains the entities for which you want to define queries."
        )

        # BUG FIX: this is a plain (non-f) string, so the doubled braces used
        # previously were emitted literally into the page, producing invalid
        # CSS. Single braces are correct here.
        st.markdown("""
            <style>
            div[data-baseweb="select"] div[data-id="select"] {
                background-color: #f0f8ff;
            }
            </style>
        """, unsafe_allow_html=True)

        st.subheader("Define Fields to Extract")
        num_fields = st.number_input(
            "Number of fields to extract",
            min_value=1,
            value=1,
            step=1,
            help="Specify how many fields you want to extract from each entity."
        )

        # Collect only non-empty field names; empty inputs are skipped.
        fields = []
        for i in range(num_fields):
            field = st.text_input(
                f"Field {i+1} name",
                key=f"field_{i}",
                placeholder=f"Enter field name for {i+1}",
                help="Name the field you want to extract from the entity."
            )
            if field:
                fields.append(field)

        if fields:
            st.subheader("Query Template")
            query_template = st.text_area(
                "Enter query template (Use '{entity}' to represent each entity)",
                value=f"Find the {', '.join(fields)} for {{entity}}",
                help="You can use {entity} as a placeholder to represent each entity in the query."
            )

            # Live preview of the template applied to the first entity value.
            if "{entity}" in query_template:
                example_entity = str(st.session_state["data"][column].iloc[0])
                example_query = query_template.replace("{entity}", example_entity)
                st.write("### Example Query Preview")
                st.code(example_query)

            if st.button("Save Query Configuration"):
                if not fields:
                    st.error("Please define at least one field to extract.")
                elif not query_template:
                    st.error("Please enter a query template.")
                else:
                    # These three keys are consumed together by the
                    # extraction page.
                    st.session_state["column_selection"] = column
                    st.session_state["query_template"] = query_template
                    st.session_state["extraction_fields"] = fields
                    st.success("Query configuration saved successfully!")
|
views/extract_information.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from funcs.llm import LLM
|
3 |
+
class ExtractInformation:
    """Streamlit page that runs LLM-backed information extraction for every
    entity in the user-selected column and renders searchable results."""

    def __init__(self, llm: LLM):
        # Search-augmented LLM used to answer one query per entity.
        self.llm = llm

    @staticmethod
    def _matches(result, search_query: str) -> bool:
        """Return True when the case-insensitive query hits the entity name
        or the extracted information of a single result row."""
        needle = search_query.lower()
        return (needle in str(result["Entity"]).lower()
                or needle in str(result["Extracted Information"]).lower())

    def CreatePage(self):
        """Render the extraction page: show the saved query template, run the
        extraction loop with progress feedback, then offer compact/detailed
        searchable views of the results."""
        st.header("Extract Information")
        # BUG FIX: "column_selection" is read below but was not part of the
        # guard; a partially cleared session would raise a KeyError.
        if ("query_template" in st.session_state
                and "data" in st.session_state
                and "column_selection" in st.session_state):
            st.write("### Using Query Template:")
            st.code(st.session_state["query_template"])

            column_selection = st.session_state["column_selection"]
            entities_column = st.session_state["data"][column_selection]

            col1, col2 = st.columns([2, 1])
            with col1:
                st.write("### Selected Entity Column:")
                st.dataframe(entities_column, use_container_width=True)

            with col2:
                start_button = st.button("Start Extraction", type="primary", use_container_width=True)

            results_container = st.empty()

            if start_button:
                with st.spinner("Extracting information..."):
                    progress_bar = st.progress(0)
                    progress_text = st.empty()

                    try:
                        results = []
                        for i, selected_entity in enumerate(entities_column):
                            # Substitute the entity into the saved template.
                            user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
                            final_answer, search_results = self.llm.refine_answer_with_searches(selected_entity, user_query)
                            results.append({
                                "Entity": selected_entity,
                                "Extracted Information": final_answer,
                                "Search Results": search_results
                            })

                            progress = (i + 1) / len(entities_column)
                            progress_bar.progress(progress)
                            progress_text.text(f"Processing {i+1}/{len(entities_column)} entities...")

                        st.session_state["results"] = results

                        progress_bar.empty()
                        progress_text.empty()
                        st.success("Extraction completed successfully!")

                    except Exception as e:
                        st.error(f"An error occurred during extraction: {str(e)}")
                        # Drop stale results so the views below stay consistent.
                        st.session_state.pop("results", None)

            if "results" in st.session_state and st.session_state["results"]:
                with results_container:
                    results = st.session_state["results"]

                    search_query = st.text_input("🔍 Search results", "")

                    tab1, tab2 = st.tabs(["Compact View", "Detailed View"])

                    with tab1:
                        found_results = False
                        for result in results:
                            if self._matches(result, search_query):
                                found_results = True
                                with st.expander(f"📋 {result['Entity']}", expanded=False):
                                    st.markdown("#### Extracted Information")
                                    st.write(result["Extracted Information"])

                        if not found_results and search_query:
                            st.info("No results found for your search.")

                    with tab2:
                        found_results = False
                        for i, result in enumerate(results):
                            if self._matches(result, search_query):
                                found_results = True
                                st.markdown(f"### Entity {i+1}: {result['Entity']}")

                                col1, col2 = st.columns(2)

                                with col1:
                                    st.markdown("#### 📝 Extracted Information")
                                    st.info(result["Extracted Information"])

                                with col2:
                                    st.markdown("#### 🔍 Search Results")
                                    st.warning(result["Search Results"])

                                st.divider()

                        if not found_results and search_query:
                            st.info("No results found for your search.")
        else:
            st.warning("Please upload your data and define the query template.")
|
views/home.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
def CreatePage():
    """Render the DataScribe landing page: a hero banner followed by four
    navigation cards laid out in two rows of two columns."""
    st.markdown("""
    <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
    <p style="text-align:center; font-size: 18px; color:#333;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
    """, unsafe_allow_html=True)

    st.markdown("""---""")

    def render_card(card_title, card_description, card_icon, target_page):
        # One clickable feature card: icon on the left, button + caption on
        # the right; clicking stores the navigation target in session state.
        icon_col, body_col = st.columns([1, 4])
        with icon_col:
            st.markdown(f"<div style='font-size: 40px; text-align:center;'>{card_icon}</div>", unsafe_allow_html=True)
        with body_col:
            if st.button(f"{card_title}", key=card_title, help=card_description):
                st.session_state.selected_page = target_page
            st.markdown(f"<p style='font-size: 14px; color:#555;'>{card_description}</p>", unsafe_allow_html=True)

    cards = [
        ("Upload Data",
         "Upload data from CSV or Google Sheets to get started with your extraction.",
         "📄", "Upload Data"),
        ("Define Custom Queries",
         "Set custom search queries for each entity in your dataset for specific information retrieval.",
         "🔍", "Define Query"),
        ("Run Automated Searches",
         "Execute automated web searches and extract relevant information using an AI-powered agent.",
         "🤖", "Extract Information"),
        ("View & Download Results",
         "View extracted data in a structured format and download as a CSV or update Google Sheets.",
         "📊", "View & Download"),
    ]

    # Two rows of two cards each, matching the original layout order.
    for row_start in (0, 2):
        left_col, right_col = st.columns([1, 1])
        for card_col, card in zip((left_col, right_col), cards[row_start:row_start + 2]):
            with card_col:
                render_card(*card)

    return True
|
views/upload_data.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from funcs.googlesheet import get_google_sheet_data
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
def CreatePage():
    """Render the data-ingestion page.

    Supports two sources: concatenating one or more uploaded CSV files, or
    fetching a range from Google Sheets. The resulting DataFrame is stored
    in ``st.session_state["data"]``.
    """
    st.header("Upload or Connect Your Data")
    data_source = st.radio("Choose data source:", ["CSV Files", "Google Sheets"])

    if data_source == "CSV Files":
        if "data" in st.session_state:
            st.success("Data uploaded successfully! Here is a preview:")
            st.dataframe(st.session_state["data"].head(10))  # Display only the first 10 rows for a cleaner view
        else:
            uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)

            # BUG FIX: with accept_multiple_files=True the uploader returns
            # an empty list (not None) before any upload, so the previous
            # `is not None` check showed "No valid data found" prematurely.
            if uploaded_files:
                dfs = []
                for uploaded_file in uploaded_files:
                    try:
                        df = pd.read_csv(uploaded_file)
                        dfs.append(df)
                    except Exception as e:
                        st.error(f"Error reading file {uploaded_file.name}: {e}")

                if dfs:
                    # Stack all uploaded CSVs into a single DataFrame.
                    full_data = pd.concat(dfs, ignore_index=True)
                    st.session_state["data"] = full_data
                    st.success("Data uploaded successfully! Here is a preview:")
                    st.dataframe(full_data.head(10))  # Show preview of first 10 rows
                else:
                    st.warning("No valid data found in the uploaded files.")

        if st.button("Clear Data"):
            # BUG FIX: pop() is safe even when no data has been stored yet;
            # `del` raised KeyError in that case.
            st.session_state.pop("data", None)
            st.success("Data has been cleared!")

    elif data_source == "Google Sheets":
        sheet_id = st.text_input("Enter Google Sheet ID")
        range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")

        if sheet_id and range_name:
            if st.button("Fetch Data"):
                with st.spinner("Fetching data from Google Sheets..."):
                    try:
                        data = get_google_sheet_data(sheet_id, range_name)
                        st.session_state["data"] = data
                        st.success("Data fetched successfully! Here is a preview:")
                        st.dataframe(data.head(10))  # Show preview of first 10 rows
                    except Exception as e:
                        st.error(f"Error fetching data: {e}")
        else:
            st.warning("Please enter both Sheet ID and Range name before fetching data.")
|
views/view_and_download.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import gspread
|
3 |
+
from google.oauth2.service_account import Credentials
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
def CreatePage():
    """Render the results page.

    Previews the extracted results, offers CSV downloads (all results, or
    just the extracted / web-search columns), and can optionally write the
    results back to a Google Sheet range.
    """
    import os  # local import: only needed to resolve the credentials path

    st.header("View & Download Results")

    if "results" in st.session_state and st.session_state["results"]:
        results_df = pd.DataFrame(st.session_state["results"])
        st.write("### Results Preview")

        # Display the results preview, highlighting the text columns.
        if "Extracted Information" in results_df.columns and "Search Results" in results_df.columns:
            st.dataframe(results_df.style.map(lambda val: 'background-color: #d3f4ff' if isinstance(val, str) else '', subset=["Extracted Information", "Search Results"]))
        else:
            st.warning("Required columns are missing in results data.")

        # Download options
        download_option = st.selectbox(
            "Select data to download:",
            ["All Results", "Extracted Information", "Web Results"]
        )

        if download_option == "All Results":
            data_to_download = results_df
        elif download_option == "Extracted Information":
            data_to_download = results_df[["Entity", "Extracted Information"]]
        elif download_option == "Web Results":
            data_to_download = results_df[["Entity", "Search Results"]]

        st.download_button(
            label=f"Download {download_option} as CSV",
            data=data_to_download.to_csv(index=False),
            file_name=f"{download_option.lower().replace(' ', '_')}.csv",
            mime="text/csv"
        )

        # Option to update Google Sheets
        update_option = st.selectbox(
            "Do you want to update Google Sheets?",
            ["No", "Yes"]
        )

        if update_option == "Yes":
            if 'sheet_id' not in st.session_state:
                st.session_state.sheet_id = ''
            if 'range_name' not in st.session_state:
                st.session_state.range_name = ''

            # Input fields for Google Sheets ID and Range
            sheet_id = st.text_input("Enter Google Sheet ID", value=st.session_state.sheet_id)
            range_name = st.text_input("Enter Range (e.g., 'Sheet1!A1')", value=st.session_state.range_name)

            if sheet_id and range_name:
                st.session_state.sheet_id = sheet_id
                st.session_state.range_name = range_name

                # Prepare data for update: header row followed by the records.
                data_to_update = [results_df.columns.tolist()] + results_df.values.tolist()

                # Update Google Sheets button
                if st.button("Update Google Sheet"):
                    try:
                        if '!' not in range_name:
                            st.error("Invalid range format. Please use the format 'SheetName!Range'.")
                        else:
                            sheet_name, cell_range = range_name.split('!', 1)
                            scopes = ["https://www.googleapis.com/auth/spreadsheets"]
                            # FIX: the credentials path was hard-coded to one
                            # developer's machine; allow overriding via env var
                            # and keep the original path as the fallback for
                            # backward compatibility.
                            creds_path = os.environ.get(
                                "DATASCRIBE_CREDENTIALS",
                                "/Users/sam22ridhi/Desktop/data/DataScribe/credentials/credentials.json"
                            )
                            creds = Credentials.from_service_account_file(creds_path, scopes=scopes)
                            client = gspread.authorize(creds)
                            sheet = client.open_by_key(sheet_id).worksheet(sheet_name)
                            sheet.clear()
                            sheet.update(f"{cell_range}", data_to_update)
                            st.success("Data updated in the Google Sheet!")
                    except Exception as e:
                        st.error(f"Error updating Google Sheet: {e}")
            else:
                st.warning("Please enter both the Sheet ID and Range name before updating.")
    else:
        st.warning("No results available to view. Please run the extraction process.")
|