oroszgy's picture
Improved keyphrase extraction by PoS filtering and IDF scores
e2e35eb unverified
raw
history blame
No virus
883 Bytes
from typing import List, Tuple
import gradio as gr
import pandas as pd
from textacy.extract.keyterms.sgrank import sgrank as keywords
from examples.common import NLP, IDF, NEWS_EXAMPLES
def process(text: str) -> pd.DataFrame:
    """Extract the top keyphrases of *text* and return them as a table.

    The text is run through the shared ``NLP`` pipeline and the resulting
    document is handed to textacy's ``sgrank`` extractor, asking for at most
    ten candidates built from ``NOUN`` / ``PROPN`` tokens, with n-gram sizes
    one to three and ``IDF`` passed as the ``idf`` argument.  A candidate is
    dropped when its text occurs inside another, different candidate, so
    only the longest variant of overlapping phrases is kept.

    Args:
        text: Raw input text to analyse.

    Returns:
        A DataFrame with the columns ``Keyphrase`` and ``Score``, one row per
        kept candidate, in the order the extractor returned them.  The two
        columns are present even when no keyphrase is found.
    """
    doc = NLP(text)
    terms: List[Tuple[str, float]] = keywords(
        doc,
        topn=10,
        include_pos=("NOUN", "PROPN"),
        idf=IDF,
        ngrams=(1, 2, 3),
    )
    phrases = [phrase for phrase, _ in terms]
    rows = [
        {"Keyphrase": phrase, "Score": score}
        for phrase, score in terms
        # NOTE: containment is a plain substring test on the term text, not a
        # token-level comparison.
        if not any(phrase != other and phrase in other for other in phrases)
    ]
    # Name the columns explicitly: with no rows, pandas would otherwise build
    # a frame without any columns and the output table would lose its header.
    return pd.DataFrame(rows, columns=["Keyphrase", "Score"])
# Input component: a ten-line text box pre-filled with the first news example.
_text_input = gr.Textbox(
    value=NEWS_EXAMPLES[0],
    lines=10,
    label="Input text",
    show_label=True,
)

# Output component: the keyphrase table produced by `process`.
_keyword_table = gr.DataFrame(
    label="Keywords",
    show_label=False,
    max_cols=2,
    max_rows=10,
)

# Gradio interface wiring `process` to the components above.
# `cache_examples` is intentionally not passed, so the library default applies.
demo = gr.Interface(
    fn=process,
    inputs=_text_input,
    outputs=_keyword_table,
    examples=NEWS_EXAMPLES,
)