Me
committed on
Commit
β’
721a732
1
Parent(s):
53391cc
Added demo
Browse files- README.md +2 -13
- app.py +30 -0
- recommend.py +121 -0
- requirements.txt +3 -0
README.md
CHANGED
@@ -1,13 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
emoji: π
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: blue
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.22.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: bsd-3-clause
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# hacker_news_recommendations
|
2 |
+
Recommending Hacker News articles based on user bios
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import nltk

from recommend import get_top_headlines, rank_headlines

# Fetch the NLTK corpora/models the recommender depends on. Without these,
# word_tokenize / pos_tag / stopwords raise LookupError on a fresh machine
# (nothing else in this app downloads them).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# NOTE(review): this module-level `stopwords` is not used anywhere in this
# file (rank_headlines loads its own copy); kept for backward compatibility.
stopwords = nltk.corpus.stopwords.words('english')

# Fetch the current Hacker News front-page headlines once at startup so
# every ranking request reuses the same snapshot.
top_headlines = get_top_headlines()
def rank(bio):
    """
    Wrapper that ranks the cached Hacker News headlines against a user bio.

    PARAMETERS:
    - bio (str): user bio to base rankings off of

    RETURNS:
    - df_rank (polars.DataFrame): DataFrame with headlines in the
        'headlines' column and ranking in the 'rank' column
    """
    # Delegate to the recommender using the module-level headline snapshot.
    ranked = rank_headlines(bio, top_headlines)
    return ranked
if __name__ == '__main__':
    # Single-textbox-in, single-dataframe-out Gradio UI around rank().
    interface = gr.Interface(
        fn=rank,
        inputs=[gr.Textbox(label='Provide a bio describing your interests')],
        outputs=[gr.Dataframe(label='Recommended Hacker News Articles')],
    )

    interface.launch()
recommend.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.tag import pos_tag
|
2 |
+
from nltk.tokenize import word_tokenize
|
3 |
+
import nltk
|
4 |
+
import polars as pl
|
5 |
+
import requests
|
6 |
+
|
7 |
+
|
8 |
+
def preprocess_bio(bio, stopwords):
    """
    Pre-processes a bio by POS-tagging, removing stopwords, and extracting just the nouns

    PARAMETERS:
    - bio (str): string to POS-tag, remove stopwords from, and extract
        nouns from
    - stopwords (list of str): stopwords to remove from bio

    RETURNS:
    - s_nouns (list of str): list of non-stopword nouns in bio
    """
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}

    # tokenize the lower-cased bio and tag every token with its part of speech
    tagged_tokens = pos_tag(word_tokenize(bio.lower()))

    # keep only noun tokens that are not stopwords
    s_nouns = []
    for token, tag in tagged_tokens:
        token = token.lower()
        if tag in noun_tags and token not in stopwords:
            s_nouns.append(token)
    return s_nouns
def _preprocess_headline(headline):
    """
    Pre-processes a headline by lower-casing it and tokenizing it

    PARAMETERS:
    - headline (str): the headline to pre-process

    RETURNS:
    - l_headline_tokens (list of str): list of lower-cased
        tokens in headline
    """
    # lower-case first so tokens compare cleanly against the bio nouns
    return word_tokenize(headline.lower())
def count_overlap(l_bio_nouns, headline):
    """
    Counts the number of nouns in common between the list of nouns from
    a bio and a headline

    PARAMETERS:
    - l_bio_nouns (list of str): list of the nouns in the bio
    - headline (str): the headline to pre-process

    RETURNS:
    - num_overlap (int): how many nouns are in both, ignoring repeated nouns
    """
    # set intersection ignores repeats, so each shared noun counts once
    headline_tokens = set(_preprocess_headline(headline))
    num_overlap = len(headline_tokens & set(l_bio_nouns))
    return num_overlap
def get_top_headlines():
    """
    Returns the headlines of the top 500 articles on Hacker News

    PARAMETERS:
    - None

    RETURNS:
    - top_headlines (list of str): headlines of the top stories; items that
      are deleted/dead (no 'title' in the API response) are skipped
    """
    TOP_STORIES_URL = 'https://hacker-news.firebaseio.com/v0/topstories.json'
    ITEM_URL = 'https://hacker-news.firebaseio.com/v0/item/{}.json'
    # never hang indefinitely on a single HTTP request
    TIMEOUT = 10

    # pulling the ids of the top 500 stories
    top_stories = requests.get(TOP_STORIES_URL, timeout=TIMEOUT)
    top_stories.raise_for_status()

    # go through the top stories' ids to pull the headlines
    top_headlines = []

    for item_id in top_stories.json():
        story_req = requests.get(ITEM_URL.format(item_id), timeout=TIMEOUT)
        story = story_req.json()
        # deleted/dead items come back as null or without a 'title';
        # skip them instead of crashing with TypeError/KeyError
        if story and 'title' in story:
            top_headlines.append(story['title'])

    return top_headlines
def rank_headlines(bio, headlines):
    """
    Ranks headlines for a bio

    PARAMETERS:
    - bio (str): user bio to base rankings off of
    - headlines (list of str): headlines to rank

    RETURNS:
    - df_rank (polars.DataFrame): DataFrame with headlines in the
        'headlines' column and ranking in the 'rank' column
    """
    # pull the non-stopword nouns out of the bio
    english_stopwords = nltk.corpus.stopwords.words('english')
    bio_nouns = preprocess_bio(bio, english_stopwords)

    # score each headline by noun overlap with the bio, rank with the
    # highest score first (method='min' ties share the best rank), then
    # drop the intermediate score column and sort best-ranked first
    df_rank = (
        pl.DataFrame({'headlines': headlines})
        .with_columns(
            pl.col('headlines')
            .map_elements(lambda headline: count_overlap(bio_nouns, headline))
            .alias('scores')
        )
        .with_columns(
            pl.col('scores').rank(method='min', descending=True).alias('rank')
        )
        .drop('scores')
        .sort(by='rank')
    )
    return df_rank
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio==4.21.0
|
2 |
+
nltk==3.8.1
|
3 |
+
polars==0.20.15
|
4 |
+
requests==2.31.0
|