Spaces:

vanessbut
/

tldr_keywords

Build error

App Files Files Community

vanessbut committed on Mar 27, 2022

Commit

1b80991

1 Parent(s): dba1b96

Добавлена основная часть кода.

Browse files

Files changed (2) hide show

app.py +15 -5
utils.py +133 -0

app.py CHANGED Viewed

@@ -3,15 +3,25 @@ import streamlit as st
 st.markdown("""### TL;DR: give me the keywords!
 Here you can get the keywords and topic of the article based on it's title or abstract.""")
-st.markdown("<p style=\"text-align:center\"><img width=200px src='https://c.tenor.com/IKt-6tAk9CUAAAAd/thats-a-lot-of-words-lots-of-words.gif'></p>", unsafe_allow_html=True)
-from transformers import pipeline
-pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
 #st.markdown("#### Title:")
 title = st.text_area("Title:")
 abstract = st.text_area("abstract:")
-#st.markdown(f"{pipe(text)}")

 st.markdown("""### TL;DR: give me the keywords!
 Here you can get the keywords and topic of the article based on it's title or abstract.""")
+st.markdown("<p style=\"text-align:center\"><img width=700px src='https://c.tenor.com/IKt-6tAk9CUAAAAd/thats-a-lot-of-words-lots-of-words.gif'></p>", unsafe_allow_html=True)
+#from transformers import pipeline
+#pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
 #st.markdown("#### Title:")
 title = st.text_area("Title:")
 abstract = st.text_area("abstract:")
+import .utils
+import spacy
+# Вообще, стоит найти pipeline, заточенный под научный текст.
+# Но этим займёмся потом, если будет время.
+main_nlp = spacy.load('en_core_web_sm')
+text = title + abstract
+#text = preprocess(text)
+st.markdown(f"{get_candidates(text, main_nlp)}")

utils.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import re
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.metrics.pairwise import euclidean_distances
+from scipy.special import softmax
def preprocess(strings):
    """
    Replace newline characters with spaces and collapse runs of spaces.

    strings - a mutable sequence (e.g. a list) of strings, or a single string.

    A sequence is cleaned in place and the very same object is returned, so
    callers that rely on the in-place update keep working.  A single string
    cannot be modified in place, so a new cleaned string is returned for it.
    """
    def clean(value):
        # Newlines become spaces first, so "a\n b" ends up as "a b".
        return re.sub(' +', ' ', value.replace('\n', ' '))

    # A plain string does not support item assignment; handle it separately
    # instead of failing inside the loop below.
    if isinstance(strings, str):
        return clean(strings)

    for index, value in enumerate(strings):
        strings[index] = clean(value)
    return strings
def get_candidates(text, nlp, min_df=0.0, ngram_range=(1, 3), max_words=None):
    """
    Extract keyword candidates: n-grams of the text that are nouns or noun phrases.

    text        - input text.
    nlp         - language analysis tool (see spaCy); it is called as nlp(text)
                  and the result must provide noun_chunks and tokens with
                  pos_, lemma_ and text attributes.
    min_df      - minimum frequency of a term, forwarded to CountVectorizer.
    ngram_range - (min_n, max_n) sizes of the n-grams, forwarded to CountVectorizer.
    max_words   - upper bound on the number of n-grams considered
                  (CountVectorizer max_features); None means no limit.

    Returns a list of strings.  The list is empty when the text is blank or
    when nothing is left after stop-word removal.
    """
    # CountVectorizer raises on an empty vocabulary, so answer blank input directly.
    if not text or not text.strip():
        return []

    # Build the most basic set of n-grams.
    vectorizer = CountVectorizer(ngram_range=ngram_range,
                                 stop_words="english",
                                 min_df=min_df,
                                 max_features=max_words)
    try:
        vectorizer.fit([text])
    except ValueError as error:
        # Text made only of stop words / punctuation: there are no candidates.
        # Any other ValueError (e.g. bad parameters) is a real error.
        if "empty vocabulary" in str(error):
            return []
        raise

    # get_feature_names() was removed from recent scikit-learn releases;
    # fall back to it only when the replacement is not available.
    if hasattr(vectorizer, "get_feature_names_out"):
        ngrams = vectorizer.get_feature_names_out()
    else:
        ngrams = vectorizer.get_feature_names()

    document = nlp(text)

    # Phrases that contain nouns.
    noun_phrases = set(chunk.text.strip().lower() for chunk in document.noun_chunks)

    # Single nouns, both as written and as lemmas.  They are lower-cased
    # because the n-grams produced by CountVectorizer are lower-cased by
    # default, so a capitalised noun would otherwise never match.
    nouns = set()
    for token in document:
        if token.pos_ == "NOUN":
            nouns.add(token.text.lower())
            nouns.add(token.lemma_.lower())

    allowed = nouns.union(noun_phrases)

    # Keep only the n-grams that are nouns or noun phrases, as plain str.
    return [str(ngram) for ngram in ngrams if ngram in allowed]
def get_embedding(texts, model, tokenizer, chunk_size=128):
    """
    Convert a collection of texts into a matrix of embeddings.

    texts      - sliceable sequence of strings.
    model      - callable invoked as model(**tokens); its result must provide
                 a "pooler_output" entry supporting .detach().numpy()
                 (presumably a Hugging Face PyTorch model - confirm with the caller).
    tokenizer  - callable invoked as
                 tokenizer(chunk, padding=True, truncation=True, return_tensors="pt");
                 its result is unpacked as keyword arguments for the model.
    chunk_size - number of texts processed per model call; must be positive.

    Returns a numpy array with one row per text.  For empty input an array
    of shape (0, 0) is returned, because the embedding size is unknown then.

    Raises ValueError if chunk_size is not positive.
    """
    # Local import: the module-level imports do not bring numpy into scope.
    import numpy as np

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # np.vstack cannot stack an empty list, so answer empty input directly.
    if len(texts) == 0:
        return np.empty((0, 0))

    embeddings = []
    for start in range(0, len(texts), chunk_size):
        # Slicing clips at the end of the sequence, so the last chunk may be shorter.
        chunk = texts[start:start + chunk_size]
        chunk_tokens = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt")
        chunk_embeddings = model(**chunk_tokens)["pooler_output"]
        embeddings.append(chunk_embeddings.detach().numpy())

    return np.vstack(embeddings)
def score_candidates(text, candidates, model, tokenizer):
    """
    Rank keyword candidates by their similarity to the whole text.

    text       - the source text.
    candidates - sequence of candidate keywords.
    model, tokenizer - passed through to get_embedding.

    Returns a numpy array with one score per candidate, in the same order as
    candidates.  The scores are a softmax and therefore sum to 1.  An empty
    array is returned for no candidates and [1.0] for a single candidate.
    """
    # Local import: the module-level imports do not bring numpy into scope.
    import numpy as np

    if len(candidates) == 0:
        return np.array([])
    if len(candidates) == 1:
        return np.array([1.0])

    # Embedding of the text.
    text_embedding = get_embedding([text], model, tokenizer)

    # Embeddings of the keywords.
    candidate_embeddings = get_embedding(candidates, model, tokenizer)

    # Take a softmax of the standardized cosine similarities.
    similarities = cosine_similarity(text_embedding, candidate_embeddings)
    spread = np.std(similarities)
    if spread == 0:
        # All candidates are equally similar; dividing by zero would turn
        # every score into NaN, so fall back to a uniform distribution.
        standardized = np.zeros_like(similarities)
    else:
        standardized = (similarities - np.mean(similarities)) / spread

    # similarities has a single row (one text); return that row.
    return softmax(standardized)[0]
def get_keywords(text, nlp, model, tokenizer, top=0.95, max_words=None):
    """
    Pick the best-scoring keyword candidates of a text.

    Candidates come from get_candidates and are scored by score_candidates.
    They are taken in order of decreasing score until the accumulated score
    exceeds top or max_words keywords have been collected (None means no
    limit on the count).

    Returns a list of (candidate, score) tuples, best first.
    """
    candidates = get_candidates(text, nlp)
    score = score_candidates(text, candidates, model, tokenizer)

    # Candidate positions ordered from the highest score to the lowest.
    ranking = score.argsort()[::-1]

    limit = len(ranking)
    if max_words is not None:
        limit = min(limit, max_words)

    result = []
    accumulated = 0.0
    # max(limit, 0): a negative limit selects nothing rather than slicing from the end.
    for position in ranking[:max(limit, 0)]:
        if accumulated > top:
            break
        probability = score[position]
        result.append((candidates[position], probability))
        accumulated += probability

    return result