awacke1 committed on
Commit
5891c7a
1 Parent(s): 436442c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -19
app.py CHANGED
@@ -46,25 +46,6 @@ def load_chart_top_tags():
46
 
47
  fig_top_tags = load_chart_top_tags()
48
 
49
- # collapse option to see more info about the data
50
- with st.expander("See more info about data"):
51
- st.markdown("### Where can I find the data")
52
- st.markdown("You can find the data as a Hugging Face dataset [here](https://huggingface.co/datasets/fabiochiu/medium-articles).")
53
- st.markdown(f"### The {n_top_tags} most occurring tags and their frequencies")
54
- st.plotly_chart(fig_top_tags, use_container_width=True)
55
- st.markdown(f"### Dataset creation")
56
- st.markdown("The articles have been scraped with Python and the [requests](https://pypi.org/project/requests/) library. Because of the scraping process, scraped articles are coming from a not uniform publication date distribution. This means that there are articles published in 2016 and in 2022, but the number of articles in this dataset published in 2016 is not the same as the number of articles published in 2022. In particular, there is a strong prevalence of articles published in 2020. Have a look at the [accompanying notebook](https://www.kaggle.com/code/fabiochiusano/medium-articles-simple-data-analysis) to see the distribution of the publication dates.")
57
-
58
- # collapse option to see a comparison between different search engine types
59
- with st.expander("Semantic search engine vs Text match search engine"):
60
- st.markdown("""
61
- Here's a brief comparison between them:
62
- - Generally, a semantic search engine works better than a text-matching search engine, as the latter (1) looks for only exact text matches between the articles and the query after some [text normalization](https://towardsdatascience.com/text-normalization-for-natural-language-processing-nlp-70a314bfa646) and (2) it doesn't take into account synonyms, etc.
63
- - The quality difference is higher if the corpus of articles is small (e.g. hundreds or thousands), because a text-matching search engine may return zero-or-few results for some queries, while a semantic search engine always returns an ordered list of articles.
64
- - On the other hand, a semantic search engine needs all the documents in the corpus to be embedded (i.e. transformed into semantic vectors thanks to a machine learning model) as a setup step, but this has to be done only once so it's not really a problem.
65
- - Using appropriate data structures that implement [fast approximate nearest neighbors algorithms](https://towardsdatascience.com/comprehensive-guide-to-approximate-nearest-neighbors-algorithms-8b94f057d6b6), both types of search engines can have low latencies.
66
- """)
67
-
68
  st_query = st.text_input("Write your query here", max_chars=100)
69
 
70
  def on_click_search():
 
46
 
47
  fig_top_tags = load_chart_top_tags()
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  st_query = st.text_input("Write your query here", max_chars=100)
50
 
51
  def on_click_search():