Me committed on
Commit
721a732
β€’
1 Parent(s): 53391cc

Added demo

Browse files
Files changed (4) hide show
  1. README.md +2 -13
  2. app.py +30 -0
  3. recommend.py +121 -0
  4. requirements.txt +3 -0
README.md CHANGED
@@ -1,13 +1,2 @@
1
- ---
2
- title: Hacker News Recommendations
3
- emoji: πŸƒ
4
- colorFrom: yellow
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 4.22.0
8
- app_file: app.py
9
- pinned: false
10
- license: bsd-3-clause
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # hacker_news_recommendations
2
+ Recommending Hacker News articles based on user bios
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import nltk
from recommend import get_top_headlines, rank_headlines

# The NLTK corpora this app relies on (stopword list, Punkt tokenizer,
# POS tagger) are not bundled with the nltk package itself.  Fetch them
# up front so a fresh deployment does not crash with LookupError on the
# first request.  quiet=True keeps the startup log clean and makes the
# call a no-op when the data is already present.
for _resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource, quiet=True)

# English stopword list (kept for parity with recommend.py's ranking).
stopwords = nltk.corpus.stopwords.words('english')

# Snapshot of the current Hacker News front page, fetched once at
# startup; every ranking request reuses this list.
top_headlines = get_top_headlines()
def rank(bio):
    """
    Rank the cached top Hacker News headlines against a user bio.

    PARAMETERS:
    - bio (str): user bio to base rankings off of

    RETURNS:
    - df_rank (polars.DataFrame): DataFrame with headlines in the
      'headlines' column and ranking in the 'rank' column
    """
    # Delegate to recommend.rank_headlines using the module-level
    # headline snapshot fetched at startup.
    df_rank = rank_headlines(bio, top_headlines)
    return df_rank
if __name__ == '__main__':
    # Single-textbox in, single-dataframe out Gradio demo.
    bio_input = gr.Textbox(label='Provide a bio describing your interests')
    ranked_output = gr.Dataframe(label='Recommended Hacker News Articles')

    demo = gr.Interface(
        fn=rank,
        inputs=[bio_input],
        outputs=[ranked_output],
    )
    demo.launch()
recommend.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.tag import pos_tag
2
+ from nltk.tokenize import word_tokenize
3
+ import nltk
4
+ import polars as pl
5
+ import requests
6
+
7
+
def preprocess_bio(bio, stopwords):
    """
    Pre-processes a bio by POS-tagging, removing stopwords, and
    extracting just the nouns

    PARAMETERS:
    - bio (str): string to POS-tag, remove stopwords from, and extract
      nouns from
    - stopwords (list of str): stopwords to remove from bio

    RETURNS:
    - s_nouns (list of str): list of non-stopword nouns in bio
    """
    NOUN_POS_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')

    # The bio is lower-cased before tokenizing, so every token is
    # already lower-case — no per-token .lower() needed afterwards.
    tagged_bio = pos_tag(word_tokenize(bio.lower()))

    # Set membership is O(1) vs O(len(stopwords)) for a list lookup.
    stopword_set = set(stopwords)

    # keep only the nouns that are not stopwords, in one pass
    s_nouns = [t for (t, pos) in tagged_bio
               if pos in NOUN_POS_TAGS and t not in stopword_set]
    return s_nouns
def _preprocess_headline(headline):
    """
    Pre-processes a headline by lower-casing it and tokenizing it

    PARAMETERS:
    - headline (str): the headline to pre-process

    RETURNS:
    - l_headline_tokens (list of str): list of lower-cased
      tokens in headline
    """
    # Lower-case first so tokens compare cleanly with the bio nouns.
    return word_tokenize(headline.lower())
def count_overlap(l_bio_nouns, headline):
    """
    Counts the number of nouns in common between the list of nouns from
    a bio and a headline

    PARAMETERS:
    - l_bio_nouns (list of str): list of the nouns in the bio
    - headline (str): the headline to pre-process

    RETURNS:
    - num_overlap (int): how many nouns are in both, ignoring repeated nouns
    """
    # Sets de-duplicate tokens, so a word repeated in either side
    # still counts only once.
    shared_tokens = set(_preprocess_headline(headline)) & set(l_bio_nouns)
    return len(shared_tokens)
def get_top_headlines():
    """
    Returns the headlines of the top (up to 500) articles on Hacker News

    PARAMETERS:
    - None

    RETURNS:
    - top_headlines (list of str): headlines of the top articles.
      Items that fail to load, come back null (deleted), or have no
      'title' field are skipped.

    RAISES:
    - requests.HTTPError: if the top-stories listing itself cannot be
      fetched
    """
    TOP_STORIES_URL = 'https://hacker-news.firebaseio.com/v0/topstories.json'
    ITEM_URL = 'https://hacker-news.firebaseio.com/v0/item/{}.json'

    # Reuse one TCP connection across the ~500 item requests instead of
    # opening a fresh one per story.
    session = requests.Session()

    # pulling the ids of the top 500 stories; fail loudly here since
    # nothing can be recommended without the listing
    top_stories = session.get(TOP_STORIES_URL, timeout=10)
    top_stories.raise_for_status()

    # go through the top stories' ids to pull the headlines
    top_headlines = []
    for item_id in top_stories.json():
        story_req = session.get(ITEM_URL.format(item_id), timeout=10)
        if not story_req.ok:
            # best-effort: skip individual items that fail to load
            continue
        story = story_req.json()
        # Deleted/dead items come back as null or without a 'title'
        # field; the original code raised TypeError/KeyError on those.
        if story and 'title' in story:
            top_headlines.append(story['title'])

    return top_headlines
def rank_headlines(bio, headlines):
    """
    Ranks headlines for a bio

    PARAMETERS:
    - bio (str): user bio to base rankings off of
    - headlines (list of str): headlines to rank

    RETURNS:
    - df_rank (polars.DataFrame): DataFrame with headlines in the
      'headlines' column and ranking in the 'rank' column, sorted
      best-first (rank 1 = most noun overlap with the bio)
    """
    # pre-process the bio once; every headline is scored against it
    stopwords = nltk.corpus.stopwords.words('english')
    l_bio_nouns = preprocess_bio(bio, stopwords)

    df_rank = pl.DataFrame({
        'headlines': headlines
    })

    # score each headline by its noun overlap with the bio.
    # return_dtype pins the column to Int64 and suppresses polars'
    # dtype-inference warning for map_elements.
    df_rank = df_rank.with_columns(
        pl.col('headlines')
        .map_elements(lambda h: count_overlap(l_bio_nouns, h),
                      return_dtype=pl.Int64)
        .alias('scores')
    )

    # rank best-first: highest score gets rank 1; method='min' gives
    # tied scores the same rank
    df_rank = df_rank.with_columns(
        pl.col('scores').rank(method='min', descending=True).alias('rank')
    )

    return df_rank.drop('scores').sort(by='rank')
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==4.21.0
2
+ nltk==3.8.1
3
+ polars==0.20.15