Me
committed on
Commit
β’
721a732
1
Parent(s):
53391cc
Added demo
Browse files- README.md +2 -13
- app.py +30 -0
- recommend.py +121 -0
- requirements.txt +3 -0
README.md
CHANGED
@@ -1,13 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
emoji: π
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: blue
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.22.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: bsd-3-clause
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# hacker_news_recommendations
|
2 |
+
Recommending Hacker News articles based on user bios
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import nltk

from recommend import get_top_headlines, rank_headlines

# Fetch the NLTK corpora/models the recommender depends on. Without these,
# word_tokenize / pos_tag / stopwords raise LookupError on a fresh machine
# (nothing else in this app downloads them).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# NOTE(review): this module-level `stopwords` is not used anywhere in this
# file (rank_headlines loads its own copy); kept for backward compatibility.
stopwords = nltk.corpus.stopwords.words('english')

# Fetch the current Hacker News front-page headlines once at startup so
# every ranking request reuses the same snapshot.
top_headlines = get_top_headlines()
def rank(bio):
    """
    Wrapper that ranks the cached Hacker News headlines against a user bio.

    PARAMETERS:
    - bio (str): user bio to base rankings off of

    RETURNS:
    - df_rank (polars.DataFrame): DataFrame with headlines in the
        'headlines' column and ranking in the 'rank' column
    """
    # Delegate to the recommender using the module-level headline snapshot.
    ranked = rank_headlines(bio, top_headlines)
    return ranked
if __name__ == '__main__':
    # Single-textbox-in, single-dataframe-out Gradio UI around rank().
    interface = gr.Interface(
        fn=rank,
        inputs=[gr.Textbox(label='Provide a bio describing your interests')],
        outputs=[gr.Dataframe(label='Recommended Hacker News Articles')],
    )

    interface.launch()
recommend.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.tag import pos_tag
|
2 |
+
from nltk.tokenize import word_tokenize
|
3 |
+
import nltk
|
4 |
+
import polars as pl
|
5 |
+
import requests
|
6 |
+
|
7 |
+
|
8 |
+
def preprocess_bio(bio, stopwords):
    """
    Pre-processes a bio by POS-tagging, removing stopwords, and extracting just the nouns

    PARAMETERS:
    - bio (str): string to POS-tag, remove stopwords from, and extract
        nouns from
    - stopwords (list of str): stopwords to remove from bio

    RETURNS:
    - s_nouns (list of str): list of non-stopword nouns in bio
    """
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}

    # tokenize the lower-cased bio and tag every token with its part of speech
    tagged_tokens = pos_tag(word_tokenize(bio.lower()))

    # keep only noun tokens that are not stopwords
    s_nouns = []
    for token, tag in tagged_tokens:
        token = token.lower()
        if tag in noun_tags and token not in stopwords:
            s_nouns.append(token)
    return s_nouns
def _preprocess_headline(headline):
    """
    Pre-processes a headline by lower-casing it and tokenizing it

    PARAMETERS:
    - headline (str): the headline to pre-process

    RETURNS:
    - l_headline_tokens (list of str): list of lower-cased
        tokens in headline
    """
    # lower-case first so tokens compare cleanly against the bio nouns
    return word_tokenize(headline.lower())
def count_overlap(l_bio_nouns, headline):
    """
    Counts the number of nouns in common between the list of nouns from
    a bio and a headline

    PARAMETERS:
    - l_bio_nouns (list of str): list of the nouns in the bio
    - headline (str): the headline to pre-process

    RETURNS:
    - num_overlap (int): how many nouns are in both, ignoring repeated nouns
    """
    # set intersection ignores repeats, so each shared noun counts once
    headline_tokens = set(_preprocess_headline(headline))
    num_overlap = len(headline_tokens & set(l_bio_nouns))
    return num_overlap
def get_top_headlines():
    """
    Returns the headlines of the top 500 articles on Hacker News

    PARAMETERS:
    - None

    RETURNS:
    - top_headlines (list of str): headlines of the top stories; items that
      are deleted/dead (no 'title' in the API response) are skipped
    """
    TOP_STORIES_URL = 'https://hacker-news.firebaseio.com/v0/topstories.json'
    ITEM_URL = 'https://hacker-news.firebaseio.com/v0/item/{}.json'
    # never hang indefinitely on a single HTTP request
    TIMEOUT = 10

    # pulling the ids of the top 500 stories
    top_stories = requests.get(TOP_STORIES_URL, timeout=TIMEOUT)
    top_stories.raise_for_status()

    # go through the top stories' ids to pull the headlines
    top_headlines = []

    for item_id in top_stories.json():
        story_req = requests.get(ITEM_URL.format(item_id), timeout=TIMEOUT)
        story = story_req.json()
        # deleted/dead items come back as null or without a 'title';
        # skip them instead of crashing with TypeError/KeyError
        if story and 'title' in story:
            top_headlines.append(story['title'])

    return top_headlines
def rank_headlines(bio, headlines):
    """
    Ranks headlines for a bio

    PARAMETERS:
    - bio (str): user bio to base rankings off of
    - headlines (list of str): headlines to rank

    RETURNS:
    - df_rank (polars.DataFrame): DataFrame with headlines in the
        'headlines' column and ranking in the 'rank' column
    """
    # pull the non-stopword nouns out of the bio
    english_stopwords = nltk.corpus.stopwords.words('english')
    bio_nouns = preprocess_bio(bio, english_stopwords)

    # score each headline by noun overlap with the bio, rank with the
    # highest score first (method='min' ties share the best rank), then
    # drop the intermediate score column and sort best-ranked first
    df_rank = (
        pl.DataFrame({'headlines': headlines})
        .with_columns(
            pl.col('headlines')
            .map_elements(lambda headline: count_overlap(bio_nouns, headline))
            .alias('scores')
        )
        .with_columns(
            pl.col('scores').rank(method='min', descending=True).alias('rank')
        )
        .drop('scores')
        .sort(by='rank')
    )
    return df_rank
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio==4.21.0
|
2 |
+
nltk==3.8.1
|
3 |
+
polars==0.20.15
|
4 |
+
requests==2.31.0
|