Spaces:

somewheresystems
/

dataclysm

Paused

App Files Files Community

somewheresy committed on Jan 21, 2024

Commit

21bee4f

verified ·

1 Parent(s): a3225ba

Upload 4 files

Browse files

Files changed (4) hide show

README.md +0 -13
app.log +0 -0
app.py +275 -0
requirements.txt +308 -0

README.md CHANGED Viewed

@@ -1,13 +0,0 @@
----
-title: Dataclysm
-emoji: 🐠
-colorFrom: purple
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.30.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.log ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,275 @@

+# Import necessary libraries
+import streamlit as st
+import pandas as pd
+import numpy as np
+from sklearn.manifold import TSNE
+from datasets import load_dataset, Dataset
+from sklearn.cluster import KMeans
+import plotly.graph_objects as go
+import time
+import logging
+# Additional libraries for querying
+from FlagEmbedding import FlagModel
+# Global variables and dataset loading
+global dataset_name
+dataset_name = 'somewheresystems/dataclysm-arxiv'
+st.session_state.dataclysm_arxiv = load_dataset(dataset_name, split="train")
+total_samples = len(st.session_state.dataclysm_arxiv)
+logging.basicConfig(filename='app.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
+# Load the dataset once at the start
+# Initialize the model for querying
+model = FlagModel('BAAI/bge-small-en-v1.5', query_instruction_for_retrieval="Represent this sentence for searching relevant passages:", use_fp16=True)
+def load_data(num_samples):
+    start_time = time.time()
+    dataset_name = 'somewheresystems/dataclysm-arxiv'
+    # Load the dataset
+    logging.info(f'Loading dataset...')
+    dataset = load_dataset(dataset_name)
+    total_samples = len(dataset['train'])
+    logging.info('Converting to pandas dataframe...')
+    # Convert the dataset to a pandas DataFrame
+    df = dataset['train'].to_pandas()
+    # Adjust num_samples if it's more than the total number of samples
+    num_samples = min(num_samples, total_samples)
+    st.sidebar.text(f'Number of samples: {num_samples} ({num_samples / total_samples:.2%} of total)')
+    # Randomly sample the dataframe
+    df = df.sample(n=num_samples)
+    # Assuming 'embeddings' column contains the embeddings
+    embeddings = df['title_embedding'].tolist()
+    print("embeddings length: " + str(len(embeddings)))
+    # Convert list of lists to numpy array
+    embeddings = np.array(embeddings, dtype=object)
+    end_time = time.time()  # End timing
+    st.sidebar.text(f'Data loading completed in {end_time - start_time:.3f} seconds')
+    return df, embeddings
+def perform_tsne(embeddings):
+    start_time = time.time()
+    logging.info('Performing t-SNE...')
+    n_samples = len(embeddings)
+    perplexity = min(30, n_samples - 1) if n_samples > 1 else 1
+    # Check if all embeddings have the same length
+    if len(set([len(embed) for embed in embeddings])) > 1:
+        raise ValueError("All embeddings should have the same length")
+    # Dimensionality Reduction with t-SNE
+    tsne = TSNE(n_components=3, perplexity=perplexity, n_iter=300)
+    # Create a placeholder for progress bar
+    progress_text = st.empty()
+    progress_text.text("t-SNE in progress...")
+    tsne_results = tsne.fit_transform(np.vstack(embeddings.tolist()))
+    # Update progress bar to indicate completion
+    progress_text.text(f"t-SNE completed. Processed {n_samples} samples with perplexity {perplexity}.")
+    end_time = time.time()  # End timing
+    st.sidebar.text(f't-SNE completed in {end_time - start_time:.3f} seconds')
+    return tsne_results
+def perform_clustering(df, tsne_results):
+    start_time = time.time()
+    # Perform KMeans clustering
+    logging.info('Performing k-means clustering...')
+    # Step 3: Visualization with Plotly
+    df['tsne-3d-one'] = tsne_results[:,0]
+    df['tsne-3d-two'] = tsne_results[:,1]
+    df['tsne-3d-three'] = tsne_results[:,2]
+    # Perform KMeans clustering
+    kmeans = KMeans(n_clusters=16)  # Change the number of clusters as needed
+    df['cluster'] = kmeans.fit_predict(df[['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']])
+    end_time = time.time()  # End timing
+    st.sidebar.text(f'k-means clustering completed in {end_time - start_time:.3f} seconds')
+    return df
+def main():
+    # Custom CSS
+    custom_css = """
+    <style>
+        /* Define the font */
+        @font-face {
+            font-family: 'F';
+            src: url('https://fonts.googleapis.com/css2?family=Martian+Mono&display=swap') format('truetype');
+        }
+        /* Apply the font to all elements */
+        * {
+            font-family: 'F', sans-serif !important;
+            color: #F8F8F8; /* Set the font color to F8F8F8 */
+        }
+        /* Add your CSS styles here */
+        h1 {
+            text-align: center;
+        }
+        h2,h3,h4 {
+            text-align: justify;
+            font-size: 8px
+        }
+        body {
+            text-align: justify;
+        }
+        .stSlider .css-1cpxqw2 {
+            background: #202020;
+        }
+        .stButton > button {
+            background-color: #202020;
+            width: 100%;
+            border: none;
+            padding: 10px 24px;
+            border-radius: 5px;
+            font-size: 16px;
+            font-weight: bold;
+        }
+        .reportview-container .main .block-container {
+            padding: 2rem;
+            background-color: #202020;
+        }
+    </style>
+    """
+    # Inject custom CSS with markdown
+    st.markdown(custom_css, unsafe_allow_html=True)
+    st.sidebar.markdown(
+        f'<img src="https://www.somewhere.systems/S2-white-logo.png" style="float: bottom-left; width: 32px; height: 32px; opacity: 1.0; animation: fadein 2s;">',
+        unsafe_allow_html=True
+    )
+    st.sidebar.title('Spatial Search Engine')
+    # Check if data needs to be loaded
+    if 'data_loaded' not in st.session_state or not st.session_state.data_loaded:
+        # User input for number of samples
+        num_samples = st.sidebar.slider('Select number of samples', 1000, total_samples, 1000)
+        if st.sidebar.button('Initialize'):
+            st.sidebar.text('Initializing data pipeline...')
+            # Define a function to reshape the embeddings and add FAISS index if it doesn't exist
+            def reshape_and_add_faiss_index(dataset, column_name):
+                # Ensure the shape of the embedding is (1000, 384) and not (1000, 1, 384)
+                # As each row in title_embedding is shaped like this: [[-0.08477783203125, -0.009719848632812, ...]]
+                # We need to flatten it to [-0.08477783203125, -0.009719848632812, ...]
+                print(f"Flattening {column_name} and adding FAISS index...")
+                # Flatten the embeddings
+                dataset[column_name] = dataset[column_name].apply(lambda x: np.array(x).flatten())
+                # Add the FAISS index
+                dataset = Dataset.from_pandas(dataset).add_faiss_index(column=column_name)
+                print(f"FAISS index for {column_name} added.")
+                return dataset
+            # Load data and perform t-SNE and clustering
+            df, embeddings = load_data(num_samples)
+            # Combine embeddings and df back into one df
+            # Convert embeddings to list of lists before assigning to df
+            embeddings_list = [embedding.flatten().tolist() for embedding in embeddings]
+            df['title_embedding'] = embeddings_list
+            # Print the first few rows of the dataframe to check
+            print(df.head())
+            # Add FAISS indices for 'title_embedding'
+            st.session_state.dataclysm_title_indexed = reshape_and_add_faiss_index(df, 'title_embedding')
+            tsne_results = perform_tsne(embeddings)
+            df = perform_clustering(df, tsne_results)
+            # Store results in session state
+            st.session_state.df = df
+            st.session_state.tsne_results = tsne_results
+            st.session_state.data_loaded = True
+            # Create custom hover text
+            df['hovertext'] = df.apply(
+                lambda row: f"<b>Title:</b> {row['title']}<br><b>arXiv ID:</b> {row['id']}<br><b>Key:</b> {row.name}", axis=1
+            )
+            st.sidebar.text("Datasets loaded, titles indexed.")
+            # Create the plot
+            fig = go.Figure(data=[go.Scatter3d(
+                x=df['tsne-3d-one'],
+                y=df['tsne-3d-two'],
+                z=df['tsne-3d-three'],
+                mode='markers',
+                hovertext=df['hovertext'],
+                hoverinfo='text',
+                marker=dict(
+                    size=1,
+                    color=df['cluster'],
+                    colorscale='Viridis',
+                    opacity=0.8
+                )
+            )])
+            fig.update_layout(
+                plot_bgcolor='#202020',
+                height=800,
+                margin=dict(l=0, r=0, b=0, t=0),
+                scene=dict(
+                    xaxis=dict(showbackground=True, backgroundcolor="#000000"),
+                    yaxis=dict(showbackground=True, backgroundcolor="#000000"),
+                    zaxis=dict(showbackground=True, backgroundcolor="#000000"),
+                ),
+                scene_camera=dict(eye=dict(x=0.001, y=0.001, z=0.001))
+            )
+            st.session_state.fig = fig
+    # Display the plot if data is loaded
+    if 'data_loaded' in st.session_state and st.session_state.data_loaded:
+        st.plotly_chart(st.session_state.fig, use_container_width=True)
+    # Sidebar for detailed view
+    if 'df' in st.session_state:
+        # Sidebar for querying
+        with st.sidebar:
+            st.sidebar.markdown("### Query Embeddings")
+            query = st.text_input("Enter your query:")
+            if st.button("Search"):
+                # Define the model
+                print("Initializing model...")
+                model = FlagModel('BAAI/bge-small-en-v1.5',
+                                query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
+                                use_fp16=True)
+                print("Model initialized.")
+                query_embedding = model.encode([query])
+                # Retrieve examples by title similarity (or abstract, depending on your preference)
+                scores_title, retrieved_examples_title = st.session_state.dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
+                df_query = pd.DataFrame(retrieved_examples_title)
+                df_query['proximity'] = scores_title
+                df_query = df_query.sort_values(by='proximity', ascending=True)
+                # Limit similarity score to 3 decimal points
+                df_query['proximity'] = df_query['proximity'].round(3)
+                # Fix the <a href link> to display properly
+                df_query['URL'] = df_query['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}" target="_blank">Link</a>')
+                st.sidebar.markdown(df_query[['title', 'proximity', 'id']].to_html(escape=False), unsafe_allow_html=True)
+            st.sidebar.markdown("# Detailed View")
+            selected_index = st.sidebar.selectbox("Select Key", st.session_state.df.id)
+            # Display metadata for the selected article
+            selected_row = st.session_state.df[st.session_state.df['id'] == selected_index].iloc[0]
+            st.markdown(f"### Title\n{selected_row['title']}", unsafe_allow_html=True)
+            st.markdown(f"### Abstract\n{selected_row['abstract']}", unsafe_allow_html=True)
+            st.markdown(f"[Read the full paper](https://arxiv.org/abs/{selected_row['id']})", unsafe_allow_html=True)
+            st.markdown(f"[Download PDF](https://arxiv.org/pdf/{selected_row['id']})", unsafe_allow_html=True)
+# Script entry point: build the app when this file is executed directly.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,308 @@

+accelerate==0.25.0
+aiofiles==23.2.1
+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+apache-beam==2.52.0
+appdirs==1.4.4
+appnope==0.1.3
+asgiref==3.7.2
+astor==0.8.1
+asttokens==2.4.1
+attrs==23.2.0
+backoff==2.2.1
+beautifulsoup4==4.12.2
+bitsandbytes==0.42.0
+blessed==1.20.0
+blinker==1.7.0
+boto==2.49.0
+build==1.0.3
+CacheControl==0.13.1
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+ci-info==0.3.0
+cleo==2.1.0
+click==8.1.7
+cloudpickle==2.2.1
+colorama==0.4.6
+comm==0.2.0
+configobj==5.0.8
+configparser==6.0.0
+contourpy==1.2.0
+crashtest==0.4.1
+crcmod==1.7
+cryptography==41.0.7
+cycler==0.12.1
+dataclasses==0.6
+dataclasses-json==0.6.3
+datasets==2.14.7
+debugpy==1.8.0
+decorator==5.1.1
+Deprecated==1.2.14
+dill==0.3.7
+diskcache==5.6.3
+distlib==0.3.8
+distro==1.9.0
+dnspython==2.4.2
+docarray==0.40.0
+docker==7.0.0
+docker-pycreds==0.4.0
+docopt==0.6.2
+dulwich==0.21.7
+ecdsa==0.18.0
+editor==1.6.5
+etelemetry==0.3.1
+executing==2.0.1
+faiss-cpu==1.7.4
+fastapi==0.108.0
+fastavro==1.9.2
+fasteners==0.19
+fastjsonschema==2.19.1
+filelock==3.13.1
+fitz==0.0.1.dev2
+FlagEmbedding==1.1.8
+fonttools==4.47.0
+frontend==0.0.3
+frozenlist==1.4.1
+fsspec==2023.10.0
+future==0.18.3
+gcs-oauth2-boto-plugin==3.0
+git-python==1.0.3
+gitdb==4.0.11
+GitPython==3.1.40
+google-apitools==0.5.32
+google-auth==2.26.2
+google-reauth==0.1.1
+googleapis-common-protos==1.62.0
+greenlet==3.0.3
+grpcio==1.57.0
+grpcio-health-checking==1.57.0
+grpcio-reflection==1.57.0
+gsutil==5.27
+h11==0.14.0
+hdfs==2.7.3
+hf_transfer==0.1.4
+html2image==2.0.4.3
+httpcore==1.0.2
+httplib2==0.20.4
+httptools==0.6.1
+httpx==0.26.0
+huggingface-hub==0.17.3
+idna==3.6
+importlib-metadata==6.11.0
+inquirer==3.2.1
+installer==0.7.0
+isodate==0.6.1
+itsdangerous==2.1.2
+jaraco.classes==3.3.0
+jcloud==0.3
+jedi==0.19.1
+jina==3.23.2
+jina-hubble-sdk==0.39.0
+Jinja2==3.1.2
+joblib==1.3.2
+Js2Py==0.74
+jsonschema==4.20.0
+jsonschema-specifications==2023.12.1
+jupyter_client==8.6.0
+jupyter_core==5.5.1
+keyring==24.3.0
+kiwisolver==1.4.5
+litellm==1.16.19
+llama-index==0.9.24
+llama_cpp_python==0.2.26
+looseversion==1.3.0
+lxml==5.0.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+marshmallow==3.20.1
+matplotlib==3.8.2
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+monotonic==1.6
+more-itertools==10.1.0
+MouseInfo==0.1.3
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+multiprocess==0.70.15
+mwparserfromhell==0.6.5
+mypy-extensions==1.0.0
+nest-asyncio==1.5.8
+networkx==3.2.1
+nibabel==5.2.0
+nipype==1.8.6
+nltk==3.8.1
+numpy==1.26.2
+oauth2client==4.1.3
+objsize==0.6.1
+open-interpreter==0.2.0
+openai==1.6.1
+opencv-python==4.9.0.80
+opentelemetry-api==1.19.0
+opentelemetry-exporter-otlp==1.19.0
+opentelemetry-exporter-otlp-proto-common==1.19.0
+opentelemetry-exporter-otlp-proto-grpc==1.19.0
+opentelemetry-exporter-otlp-proto-http==1.19.0
+opentelemetry-exporter-prometheus==0.41b0
+opentelemetry-instrumentation==0.40b0
+opentelemetry-instrumentation-aiohttp-client==0.40b0
+opentelemetry-instrumentation-asgi==0.40b0
+opentelemetry-instrumentation-fastapi==0.40b0
+opentelemetry-instrumentation-grpc==0.40b0
+opentelemetry-proto==1.19.0
+opentelemetry-sdk==1.19.0
+opentelemetry-semantic-conventions==0.40b0
+opentelemetry-util-http==0.40b0
+orjson==3.9.10
+packaging==23.2
+pandas==2.1.4
+parso==0.8.3
+pathlib==1.0.1
+pathspec==0.12.1
+pdfminer.six==20221105
+pdfplumber==0.10.3
+peft==0.7.1
+pexpect==4.9.0
+Pillow==10.1.0
+pkginfo==1.9.6
+platformdirs==4.0.0
+plotly==5.18.0
+plyer==2.1.0
+poetry==1.7.1
+poetry-core==1.8.1
+poetry-plugin-export==1.6.0
+posthog==3.1.0
+pretty-traceback==2023.1020
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+proto-plus==1.23.0
+protobuf==4.25.1
+prov==2.0.0
+psutil==5.9.7
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==11.0.0
+pyarrow-hotfix==0.6
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+PyAutoGUI==0.9.54
+pydantic==2.5.3
+pydantic-settings==2.1.0
+pydantic_core==2.14.6
+pydeck==0.8.1b0
+pydot==1.4.2
+PyGetWindow==0.0.9
+Pygments==2.17.2
+pyjsparser==2.7.1
+PyMonCtl==0.7
+pymongo==4.6.1
+PyMsgBox==1.0.9
+pyopencl==2023.1.4
+pyOpenSSL==23.3.0
+pypandoc==1.12
+pyparsing==3.1.1
+pypdf==3.17.4
+PyPDF2==3.0.1
+pypdfium2==4.25.0
+pyperclip==1.8.2
+pyproject_hooks==1.0.0
+PyRect==0.2.0
+PyScreeze==0.1.30
+pytesseract==0.3.10
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-jose==3.3.0
+python-multipart==0.0.6
+pytils==0.4.1
+pytools==2023.1.1
+pytweening==1.0.7
+pytz==2023.3.post1
+pyu2f==0.1.5
+PyWinBox==0.6
+PyWinCtl==0.3
+pyxnat==1.6
+PyYAML==6.0.1
+pyzmq==25.1.2
+rapidfuzz==3.6.1
+ray==2.9.0
+rdflib==7.0.0
+readchar==4.0.5
+referencing==0.32.0
+regex==2023.12.25
+requests==2.31.0
+requests-toolbelt==1.0.0
+retry-decorator==1.1.1
+rich==13.7.0
+rpds-py==0.16.2
+rsa==4.7.2
+rubicon-objc==0.4.7
+runs==1.2.0
+safetensors==0.4.1
+scikit-learn==1.3.2
+scipy==1.11.4
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+sentry-sdk==1.39.2
+setproctitle==1.3.3
+shellingham==1.5.4
+simplejson==3.19.2
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+soupsieve==2.5
+SQLAlchemy==2.0.24
+sse-starlette==1.8.2
+stack-data==0.6.3
+starlette==0.32.0.post1
+starlette-context==0.3.6
+streamlit==1.30.0
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.2.0
+tiktoken==0.4.0
+tinygrad==0.7.0
+tokenizers==0.14.1
+tokentrim==0.1.13
+toml==0.10.2
+tomlkit==0.12.3
+tools==0.1.9
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.14.0
+traits==6.3.2
+transformers==4.34.0
+trove-classifiers==2023.11.29
+types-requests==2.31.0.6
+types-urllib3==1.26.25.14
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+tzlocal==5.2
+urllib3==2.1.0
+uvicorn==0.24.0.post1
+uvloop==0.19.0
+validators==0.22.0
+virtualenv==20.25.0
+wandb==0.16.2
+watchdog==3.0.0
+watchfiles==0.21.0
+wcwidth==0.2.12
+websocket-client==1.7.0
+websockets==12.0
+wget==3.2
+wrapt==1.16.0
+xattr==0.10.1
+xmod==1.8.1
+xxhash==3.4.1
+yarl==1.9.4
+youtube-dl==2021.12.17
+zipp==3.17.0
+zstandard==0.22.0