import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import time

def find_abstracts(soup):
  # Pull the identifier, title, and abstract out of each CSW record
  id_list = []
  abs_list = []
  title_list = []

  for record in soup.find_all("csw:record"):
    identifier = record.find("dc:identifier")
    abstract = record.find("dct:abstract")
    title = record.find("dc:title")

    id_list.append(identifier.text)
    title_list.append(title.text)

    # not every record has an abstract
    if abstract is not None:
      abs_list.append(abstract.text)
    else:
      abs_list.append("NA")

  return id_list, title_list, abs_list

def get_metadata():
  # Fetch the first 5,000 metadata records from NOAA's Geoportal CSW endpoint
  URL = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"

  page = requests.get(URL)
  soup = BeautifulSoup(page.text, "lxml")

  id_list, title_list, abs_list = find_abstracts(soup)
  df = pd.DataFrame(list(zip(id_list, title_list, abs_list)), columns=["identifier", "title", "abstract"])
  # cache the metadata locally; index=False keeps read_csv from
  # picking up a stray unnamed index column later
  df.to_csv("./ncei-metadata.csv", index=False)

  return df
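
# --- Hypothetical extension (not part of the original app): the request above
# only retrieves the first 5,000 records, a limitation noted under Critical
# Analysis below. The endpoint's "from"/"size" query parameters suggest a
# paged fetch along these lines could cover more of the catalog; the paging
# bounds here are illustrative guesses, not documented limits.
def get_all_metadata(page_size=5000, max_records=50000):
  frames = []
  for start in range(0, max_records, page_size):
    url = ("https://www.ncei.noaa.gov/metadata/geoportal/opensearch"
           f"?f=csw&from={start}&size={page_size}&sort=title.sort")
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    id_list, title_list, abs_list = find_abstracts(soup)
    if not id_list:
      # no more records returned; stop paging
      break
    frames.append(pd.DataFrame({"identifier": id_list,
                                "title": title_list,
                                "abstract": abs_list}))
  return pd.concat(frames, ignore_index=True)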
  
def show_model(query):
  path = "./ncei-metadata.csv"

  # Refresh the cached metadata if it is missing or more than a day old
  if os.path.exists(path):
    last_modified = os.path.getmtime(path)
    now = time.time()
    DAY = 86400

    if now - last_modified > DAY:
      df = get_metadata()
    else:
      df = pd.read_csv(path)
  else:
    df = get_metadata()

  # Use the abstracts as the documents, skipping records without one
  docs_df = df[df["abstract"] != "NA"]
  docs = list(docs_df["abstract"])
  titles = list(docs_df["title"])

  # Load the pretrained semantic-search model
  model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

  # Encode the query and documents
  query_emb = model.encode(query)
  doc_emb = model.encode(docs)

  # Compute the dot score between the query and all document embeddings
  scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

  # Combine docs, scores, and titles, sorted by decreasing score
  doc_score_pairs = sorted(zip(docs, scores, titles), key=lambda x: x[1], reverse=True)
  return doc_score_pairs
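
# --- Hypothetical optimization (not in the original code): Streamlit re-runs
# the whole script on each interaction, so show_model() re-encodes every
# abstract per query, a cost noted under Critical Analysis below. Assuming a
# recent Streamlit with st.cache_data, a helper like this could compute the
# corpus embeddings once and reuse them across runs:
@st.cache_data
def embed_docs(docs):
  # encode the document list once; Streamlit caches the returned array
  model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
  return model.encode(docs)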
  
def main():
  st.title("Semantic Search for Datasets Using Sentence Transformers")
  st.write("A case study for the National Centers for Environmental Information (NCEI)")
  st.image("noaa_logo.png", width=150)
  
  st.write("## Goal: search for datasets in NCEI's Archive using natural language queries")
  st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
  
  st.image("pres-whatisnoaa.png")
  
  st.write("## The Problem Context")
  st.write("Uses service called OneStop for data search")
  st.write("**Problems:**")
  st.write("- Uses keyword search -- not robust to natural language queries")
  st.write("- Filtering options too specific for non-expert users")
  #st.image("pres-onestop.png")
  #st.image("pres-problems.png")
  
  st.write("## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)")
  st.image("pres-sentencetransformers.png")
  
  st.write("## Project Data")
  st.image("pres-metadata.png")
  
  st.write("## The Process")
  st.image("pres-creatingse.png")
  
  st.write("## Results and Demo")
  
  st.write("[Demo Notebook](https://github.com/myrandaGoesToSpace/semantic-search-datasets/blob/main/semantic_search.ipynb)")

  st.image("pres-futureplans.png")
  
  st.write("## Critical Analysis")
  st.write("- did not run with Streamlit text input")
  st.write("- only embeds the first 5000 datasets")
  st.write("- calculates embeddings for datasets with each run")

if __name__ == "__main__":
  main()
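
# --- Hypothetical wiring (not called anywhere; the Critical Analysis notes
# the search "did not run with Streamlit text input"). One way the model
# could be hooked up to the page, reusing show_model() above; the label text
# and the choice of five results are illustrative:
def demo_search():
  query = st.text_input("Search NCEI datasets")
  if query:
    results = show_model(query)
    # show the five best-matching datasets with their scores
    for abstract, score, title in results[:5]:
      st.write(f"**{title}** (score: {score:.3f})")
      st.write(abstract)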