myshirk committed on
Commit
ff974ba
1 Parent(s): 0deac40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -2
app.py CHANGED
@@ -1,7 +1,48 @@
1
  import streamlit as st
 
 
 
 
2
 
3
- def show_model(query):
4
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def main():
7
  st.title("Semantic Search for Datasets Using Sentence Transformers")
@@ -17,6 +58,7 @@ def main():
17
  st.write("Uses service called OneStop for data search")
18
  st.write("**Problems:**")
19
  st.write("- Uses keyword search -- not robust to natural language queries")
 
20
  #st.image("pres-onestop.png")
21
  #st.image("pres-problems.png")
22
 
 
1
  import streamlit as st
2
+ from sentence_transformers import SentenceTransformer, util
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ import requests
6
 
7
def find_abstracts(soup):
    """Collect identifier, title and abstract text from every CSW record.

    Args:
        soup: A parsed document exposing ``find_all(name)``, whose results
            expose ``find(name)`` returning either ``None`` or an object
            with a ``.text`` attribute (e.g. a ``BeautifulSoup`` tree).

    Returns:
        A tuple ``(id_list, title_list, abs_list)`` of three equally long
        lists of strings, one entry per ``csw:record`` element, in document
        order. A field missing from a record is stored as ``"NA"``.
    """
    id_list = []
    title_list = []
    abs_list = []

    for record in soup.find_all("csw:record"):
        identifier = record.find("dc:identifier")
        title = record.find("dc:title")
        abstract = record.find("dct:abstract")

        # Every field falls back to "NA" so that one incomplete record
        # neither aborts the whole harvest nor misaligns the three lists.
        id_list.append(identifier.text if identifier is not None else "NA")
        title_list.append(title.text if title is not None else "NA")
        abs_list.append(abstract.text if abstract is not None else "NA")

    return id_list, title_list, abs_list
29
+
30
def get_metadata(timeout=60):
    """Download NCEI Geoportal metadata and save it to ``./ncei-metadata.csv``.

    Requests up to 5000 records (sorted by title) from the Geoportal
    OpenSearch endpoint in CSW format, extracts identifier, title and
    abstract from each record via ``find_abstracts``, and writes the
    resulting table to disk.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the app indefinitely.

    Returns:
        A ``pandas.DataFrame`` with columns ``identifier``, ``title`` and
        ``abstract``, one row per record.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Get the abstracts from Geoportal
    url = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"

    page = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than parsing an error page and
    # overwriting the CSV with an empty table.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "lxml")

    id_list, title_list, abs_list = find_abstracts(soup)
    df = pd.DataFrame(
        {"identifier": id_list, "title": title_list, "abstract": abs_list}
    )
    df.to_csv("./ncei-metadata.csv")

    return df
42
+
43
def show_model():
    """Placeholder for the model view; currently does nothing."""
    return None
46
 
47
  def main():
48
  st.title("Semantic Search for Datasets Using Sentence Transformers")
 
58
  st.write("Uses service called OneStop for data search")
59
  st.write("**Problems:**")
60
  st.write("- Uses keyword search -- not robust to natural language queries")
61
+ st.write("- Filtering options too specific for non-expert users")
62
  #st.image("pres-onestop.png")
63
  #st.image("pres-problems.png")
64