mrmft committed
Commit 4da642e
1 parent: 39d7a1a

adding project source

Files changed (14)
  1. Dockerfile +26 -0
  2. app.py +202 -0
  3. docker-compose.yml +10 -0
  4. functionforDownloadButtons.py +171 -0
  5. kpe.py +67 -0
  6. kpe_ranker.py +24 -0
  7. labeling.py +125 -0
  8. logo.png +0 -0
  9. main.py +71 -0
  10. ner_data_construction.py +70 -0
  11. predict.py +14 -0
  12. ranker.py +28 -0
  13. requirements.txt +9 -0
  14. utils.py +63 -0
Dockerfile ADDED
@@ -0,0 +1,26 @@
+ FROM python:3.9
+
+ RUN mkdir /app
+ WORKDIR /app
+
+
+ # download model and put in trained_model folder
+ # RUN wget https://drive.ahdsoft.dev/s/xp5Mb7bQ34Z7BRX/download/trained_model_10000.pt
+ # RUN mkdir trained_model
+ # RUN mv trained_model_10000.pt trained_model/
+
+ # download packages
+ COPY requirements.txt .
+
+ ENV HTTP_PROXY http://172.17.0.1:10805
+ ENV HTTPS_PROXY http://172.17.0.1:10805
+ ENV http_proxy http://172.17.0.1:10805
+ ENV https_proxy http://172.17.0.1:10805
+
+ RUN pip install git+https://github.com/mohammadkarrabi/NERDA.git
+ RUN pip install -r requirements.txt
+ RUN pip install sentence_transformers
+
+ COPY . .
+
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7201", "--server.address=0.0.0.0", "--client.showErrorDetails=false"]
app.py ADDED
@@ -0,0 +1,202 @@
+ import streamlit as st
+ import numpy as np
+ import re
+ from pandas import DataFrame
+ # from keybert import KeyBERT
+ # For Flair (Keybert)
+ # from flair.embeddings import TransformerDocumentEmbeddings
+ import seaborn as sns
+ # For download buttons
+ from functionforDownloadButtons import download_button
+ import os
+ import json
+
+ from kpe_ranker import KpeRanker
+
+ st.set_page_config(
+     page_title="استخراج عبارات کلیدی عهد",  # "Ahd keyphrase extraction"
+     page_icon="🎈",
+ )
+
+
+ def _max_width_():
+     max_width_str = "max-width: 1400px;"
+     st.markdown(
+         f"""
+         <style>
+         .reportview-container .main .block-container{{
+             {max_width_str}
+         }}
+         </style>
+         """,
+         unsafe_allow_html=True,
+     )
+
+
+ _max_width_()
+
+ c30, c31, c32 = st.columns([2.5, 1, 3])
+
+ with c30:
+     # st.image("logo.png", width=400)
+     st.title("🔑 استخراج عبارات کلیدی")  # "Keyphrase extraction"
+     st.header("")
+
+
+ with st.expander("ℹ️ - About this app", expanded=True):
+
+     # (Persian) "Keyphrase extraction is a new product from the Ahd company that, in the
+     # evaluations performed, has shown higher accuracy than its competitors."
+     st.write(
+         """
+         - استخراج عبارات کلیدی، محصولی نوین از شرکت عهد است که در ارزیابی‌های صورت‌گرفته، دقت بیشتری را نسبت به رقبا از خود نشان داده است.
+         """
+     )
+
+     st.markdown("")
+
+ st.markdown("")
+ # st.markdown("## **...**")
+ with st.form(key="my_form"):
+
+     ce, c1, ce, c2, c3 = st.columns([0.07, 1, 0.07, 5, 0.07])
+     with c1:
+
+         # if ModelType == "Default (DistilBERT)":
+         #     kw_model = KeyBERT(model=roberta)
+
+         @st.cache(allow_output_mutation=True)
+         def load_model():
+             return KpeRanker()
+
+         kpe_ranker_extractor = load_model()
+
+         # else:
+         #     @st.cache(allow_output_mutation=True)
+         #     def load_model():
+         #         return KeyBERT("distilbert-base-nli-mean-tokens")
+         #
+         #     kw_model = load_model()
+
+         top_N = st.slider(
+             "# تعداد",  # "count"
+             min_value=1,
+             max_value=30,
+             value=10,
+             help="You can choose the number of keywords/keyphrases to display. Between 1 and 30, the default is 10.",
+         )
+         # min_Ngrams = st.number_input(
+         #     "Minimum Ngram",
+         #     min_value=1,
+         #     max_value=4,
+         #     help="""The minimum value for the ngram range.
+         #
+         #     *Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.
+         #
+         #     To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
+         #     # help="Minimum value for the keyphrase_ngram_range. keyphrase_ngram_range sets the length of the resulting keywords/keyphrases. To extract keyphrases, simply set keyphrase_ngram_range to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.",
+         # )
+
+         # max_Ngrams = st.number_input(
+         #     "Maximum Ngram",
+         #     value=2,
+         #     min_value=1,
+         #     max_value=4,
+         #     help="""The maximum value for the keyphrase_ngram_range.
+         #
+         #     *Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.
+         #
+         #     To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
+         # )
+
+         # StopWordsCheckbox = st.checkbox(
+         #     "Remove stop words",
+         #     help="Tick this box to remove stop words from the document (currently English only)",
+         # )
+
+         use_ner = st.checkbox(
+             "NER",
+             value=True,
+             help="استفاده از شناسایی موجودیت‌های نام‌دار",  # "use named-entity recognition"
+         )
+
+     with c2:
+         doc = st.text_area(
+             "متن خود را وارد کنید",  # "enter your text"
+             height=510,
+         )
+
+         MAX_WORDS = 500
+         res = len(re.findall(r"\w+", doc))
+         if res > MAX_WORDS:
+             st.warning(
+                 "⚠️ Your text contains "
+                 + str(res)
+                 + " words."
+                 + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
+             )
+             # keep only the first MAX_WORDS words (slicing the string would truncate by characters)
+             doc = " ".join(doc.split()[:MAX_WORDS])
+
+         submit_button = st.form_submit_button(label="✨ پردازش")  # "process"
+
+
+ if not submit_button:
+     st.stop()
+
+
+ #################################### get keyphrases #######################################################
+
+ keywords = kpe_ranker_extractor.extract(text=doc, count=top_N, using_ner=use_ner, return_sorted=True)
+ # print(keywords)
+ st.markdown("## **🎈 Check & download results**")
+
+ st.header("")
+
+ cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2])
+
+ with c1:
+     CSVButton2 = download_button(keywords, "Data.csv", "📥 Download (.csv)")
+ with c2:
+     CSVButton2 = download_button(keywords, "Data.txt", "📥 Download (.txt)")
+ with c3:
+     CSVButton2 = download_button(keywords, "Data.json", "📥 Download (.json)")
+
+ st.header("")
+
+ df = (
+     DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+     .sort_values(by="Relevancy", ascending=False)
+     .reset_index(drop=True)
+ )
+
+ df.index += 1
+
+ # Add styling
+ cmGreen = sns.light_palette("green", as_cmap=True)
+ cmRed = sns.light_palette("red", as_cmap=True)
+ df = df.style.background_gradient(
+     cmap=cmGreen,
+     subset=[
+         "Relevancy",
+     ],
+ )
+
+ c1, c2, c3 = st.columns([1, 3, 1])
+
+ format_dictionary = {
+     "Relevancy": "{:.1%}",
+ }
+
+ df = df.format(format_dictionary)
+
+ with c2:
+     st.table(df)
docker-compose.yml ADDED
@@ -0,0 +1,10 @@
+ version: "3.8"
+
+
+ services:
+   kpe:
+     build: .
+     ports:
+       - "7201:7201"
+     volumes:
+       - "/home/dev/ml_models/kpe:/app/trained_model"
functionforDownloadButtons.py ADDED
@@ -0,0 +1,171 @@
+ import streamlit as st
+ import pickle
+ import pandas as pd
+ import json
+ import base64
+ import uuid
+ import re
+ import math
+
+ import importlib.util
+
+
+ def import_from_file(module_name: str, filepath: str):
+     """
+     Imports a module from file.
+
+     Args:
+         module_name (str): Assigned to the module's __name__ parameter (does not
+             influence how the module is named outside of this function)
+         filepath (str): Path to the .py file
+
+     Returns:
+         The module
+     """
+     spec = importlib.util.spec_from_file_location(module_name, filepath)
+     module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(module)
+     return module
+
+
+ def notebook_header(text):
+     """
+     Insert section header into a jinja file, formatted as notebook cell.
+
+     Leave 2 blank lines before the header.
+     """
+     return f"""# # {text}
+
+ """
+
+
+ def code_header(text):
+     """
+     Insert section header into a jinja file, formatted as Python comment.
+
+     Leave 2 blank lines before the header.
+     """
+     separator_len = (75 - len(text)) / 2
+     separator_len_left = math.floor(separator_len)
+     separator_len_right = math.ceil(separator_len)
+     return f"# {'-' * separator_len_left} {text} {'-' * separator_len_right}"
+
+
+ def to_notebook(code):
+     """Converts Python code to Jupyter notebook format."""
+     import jupytext  # optional dependency, only needed when this helper is used
+     notebook = jupytext.reads(code, fmt="py")
+     return jupytext.writes(notebook, fmt="ipynb")
+
+
+ def open_link(url, new_tab=True):
+     """Dirty hack to open a new web page with a streamlit button."""
+     # From: https://discuss.streamlit.io/t/how-to-link-a-button-to-a-webpage/1661/3
+     from bokeh.models import Div  # optional dependency, only needed when this helper is used
+     if new_tab:
+         js = f"window.open('{url}')"  # New tab or window
+     else:
+         js = f"window.location.href = '{url}'"  # Current tab
+     html = '<img src onerror="{}">'.format(js)
+     div = Div(text=html)
+     st.bokeh_chart(div)
+
+
+ def download_button(object_to_download, download_filename, button_text):
+     """
+     Generates a link to download the given object_to_download.
+
+     From: https://discuss.streamlit.io/t/a-download-button-with-custom-css/4220
+
+     Params:
+     ------
+     object_to_download: The object to be downloaded.
+     download_filename (str): filename and extension of file, e.g. mydata.csv,
+         some_txt_output.txt
+     button_text (str): Text to display on the download button (e.g. 'click here to download file')
+
+     Returns:
+     -------
+     (str): the anchor tag to download object_to_download
+
+     Examples:
+     --------
+     download_button(your_df, 'YOUR_DF.csv', 'Click to download data!')
+     download_button(your_str, 'YOUR_STRING.txt', 'Click to download text!')
+
+     """
+     # if pickle_it:
+     #     try:
+     #         object_to_download = pickle.dumps(object_to_download)
+     #     except pickle.PicklingError as e:
+     #         st.write(e)
+     #         return None
+
+     # bytes pass through unchanged, DataFrames become CSV, everything else is JSON-encoded
+     if isinstance(object_to_download, bytes):
+         pass
+
+     elif isinstance(object_to_download, pd.DataFrame):
+         object_to_download = object_to_download.to_csv(index=False)
+     # Try JSON encode for everything else
+     else:
+         object_to_download = json.dumps(object_to_download)
+
+     try:
+         # some strings <-> bytes conversions necessary here
+         b64 = base64.b64encode(object_to_download.encode()).decode()
+     except AttributeError:
+         b64 = base64.b64encode(object_to_download).decode()
+
+     button_uuid = str(uuid.uuid4()).replace("-", "")
+     button_id = re.sub(r"\d+", "", button_uuid)
+
+     custom_css = f"""
+         <style>
+             #{button_id} {{
+                 display: inline-flex;
+                 align-items: center;
+                 justify-content: center;
+                 background-color: rgb(255, 255, 255);
+                 color: rgb(38, 39, 48);
+                 padding: .25rem .75rem;
+                 position: relative;
+                 text-decoration: none;
+                 border-radius: 4px;
+                 border-width: 1px;
+                 border-style: solid;
+                 border-color: rgb(230, 234, 241);
+                 border-image: initial;
+             }}
+             #{button_id}:hover {{
+                 border-color: rgb(246, 51, 102);
+                 color: rgb(246, 51, 102);
+             }}
+             #{button_id}:active {{
+                 box-shadow: none;
+                 background-color: rgb(246, 51, 102);
+                 color: white;
+             }}
+         </style> """
+
+     dl_link = (
+         custom_css
+         + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br><br>'
+     )
+     # dl_link = f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}"><input type="button" kind="primary" value="{button_text}"></a><br></br>'
+
+     st.markdown(dl_link, unsafe_allow_html=True)
+
+
+ # def download_link(
+ #     content, label="Download", filename="file.txt", mimetype="text/plain"
+ # ):
+ #     """Create a HTML link to download a string as a file."""
+ #     # From: https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806/9
+ #     b64 = base64.b64encode(
+ #         content.encode()
+ #     ).decode()  # some strings <-> bytes conversions necessary here
+ #     href = (
+ #         f'<a href="data:{mimetype};base64,{b64}" download="{filename}">{label}</a>'
+ #     )
+ #     return href
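For reference, a minimal usage sketch of download_button outside the app (a hedged sketch: it assumes a running Streamlit session, and the keyword pairs below are made up):

import pandas as pd
from functionforDownloadButtons import download_button

# hypothetical extraction output: (keyphrase, relevancy) pairs
keywords = [("هوش مصنوعی", 0.91), ("پردازش زبان طبیعی", 0.84)]

download_button(pd.DataFrame(keywords), "Data.csv", "📥 Download (.csv)")   # DataFrame -> CSV branch
download_button(keywords, "Data.json", "📥 Download (.json)")               # anything else -> JSON branch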
kpe.py ADDED
@@ -0,0 +1,67 @@
+ from flair.data import Sentence
+ from flair.models import SequenceTagger
+ from NERDA.models import NERDA
+ from hazm import word_tokenize
+ import flair
+ import utils
+
+
+ class KPE:
+     def __init__(self, trained_kpe_model, flair_ner_model, device='cpu') -> None:
+         self.extractor_model = NERDA(
+             tag_scheme=['B-KEYWORD', 'I-KEYWORD'],
+             tag_outside='O',
+             transformer='xlm-roberta-large',
+             max_len=512,
+             device=device)
+         flair.device = device
+
+         self.extractor_model.load_network_from_file(trained_kpe_model)
+         self.ner_tagger = SequenceTagger.load(flair_ner_model)
+         self.IGNORE_TAGS = {'ORDINAL', 'DATE', 'CARDINAL'}
+
+     @staticmethod
+     def combine_keywords_nes(init_keywords, nes):
+         # init_keywords = list(set(init_keywords))
+         nes = list(set(nes))
+         print('nes before combined ', nes)
+         combined_keywords = []
+         for kw in init_keywords:
+             matched_index = utils.fuzzy_subword_match(kw, nes)
+             if matched_index != -1:
+                 print(kw, nes[matched_index])
+                 combined_keywords.append(nes[matched_index])
+                 del nes[matched_index]
+             else:
+                 combined_keywords.append(kw)
+         print('nes after combined ', nes)
+         combined_keywords.extend([n for n in nes if n not in combined_keywords])
+         return combined_keywords
+
+     def extract(self, txt, using_ner=True):
+         sentence = Sentence(txt)
+
+         # predict NER tags
+         if using_ner:
+             self.ner_tagger.predict(sentence)
+             nes = [entity.text for entity in sentence.get_spans('ner') if entity.tag not in self.IGNORE_TAGS]
+         else:
+             nes = []
+
+         # remove puncs
+         nes = list(map(utils.remove_puncs, nes))
+         print('nes ', nes)
+         sentences, tags_conf = self.extractor_model.predict_text(txt, sent_tokenize=lambda txt: [txt], word_tokenize=lambda txt: txt.split(), return_confidence=True)
+         init_keywords = utils.get_ne_from_iob_output(sentences, tags_conf)
+         init_keywords = list(map(utils.remove_puncs, init_keywords))
+         print('init keywords : ', init_keywords)
+
+         # combine ner response and init keywords
+         merged_keywords = self.combine_keywords_nes(init_keywords, nes)
+
+         # deduplicate but keep order
+         final_keywords = []
+         for kw in merged_keywords:
+             if kw not in final_keywords:
+                 final_keywords.append(kw)
+         return final_keywords
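A small illustration of how combine_keywords_nes merges the tagger's keyphrases with NER spans (a sketch, assuming the NERDA and flair dependencies are importable; the strings are made up, and the expected result follows the method's logic: an NER span that contains a keyphrase replaces it, and unmatched entities are appended):

from kpe import KPE

init_keywords = ["دانشگاه تهران", "هوش مصنوعی"]
nes = ["دانشگاه تهران ایران"]  # a longer NER span containing the first keyphrase

merged = KPE.combine_keywords_nes(init_keywords, nes)  # static method, no model loading required
print(merged)  # expected: ['دانشگاه تهران ایران', 'هوش مصنوعی']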
kpe_ranker.py ADDED
@@ -0,0 +1,24 @@
+ from kpe import KPE
+ import utils
+ import os
+ from sentence_transformers import SentenceTransformer
+ import ranker
+
+
+ class KpeRanker:
+     def __init__(self):
+         TRAINED_MODEL_ADDR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'trained_model', 'trained_model_10000.pt')
+         self.kpe = KPE(trained_kpe_model=TRAINED_MODEL_ADDR, flair_ner_model='flair/ner-english-ontonotes-large', device='cpu')
+         self.ranker_transformer = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2', device='cpu')
+
+     def extract(self, text, count, using_ner, return_sorted):
+         text = utils.normalize(text)
+         kps = self.kpe.extract(text, using_ner=using_ner)
+         if return_sorted:
+             kps = ranker.get_sorted_keywords(self.ranker_transformer, text, kps)
+         else:
+             kps = [(kp, 1) for kp in kps]
+         if len(kps) > count:
+             kps = kps[:count]
+         return kps
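A minimal usage sketch of KpeRanker (assuming the trained checkpoint exists under trained_model/ and the flair and sentence-transformers models can be downloaded; the input text is a placeholder):

from kpe_ranker import KpeRanker

ranker = KpeRanker()  # loads the KPE tagger, the flair NER model and the ranking transformer
text = "..."          # any Persian document
# top-10 keyphrases with cosine-similarity scores, highest first
for phrase, score in ranker.extract(text, count=10, using_ner=True, return_sorted=True):
    print(f"{score:.3f}\t{phrase}")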
labeling.py ADDED
@@ -0,0 +1,125 @@
+ import jsonlines
+ import json
+ from tqdm import tqdm
+ import time
+ from openai import error as openai_error
+ import pandas as pd
+ import openai
+ import tiktoken
+ import os
+ import glob
+
+ GPT_MODEL = 'gpt-3.5-turbo'
+ GPT_TOKEN_LIMIT = 1500
+ os.environ["OPENAI_API_KEY"] = 'sk-...'  # hardcoded key redacted; set your own key here or via the environment
+ # os.environ["OPENAI_API_KEY"] = 'sk-...'
+ openai.api_key = os.environ["OPENAI_API_KEY"]
+
+ LAST_INDEX_FILE_ADDR = 'last_index.txt'
+ TOKEN_COUNT_FILE_ADDR = 'tikitoken_count.txt'
+
+ error_count = 0  # consecutive-error counter used by get_keyphrase_by_gpt
+
+
+ def num_tokens(text: str, model: str = GPT_MODEL) -> int:
+     """Return the number of tokens in a string."""
+     encoding = tiktoken.encoding_for_model(model)
+     return len(encoding.encode(text))
+
+
+ def extract_seen_ids():
+     seen_ids = set()
+     for tagged_data_addr in glob.iglob('./tagged_data*'):
+         seen_ids.update([json.loads(line)['id'] for line in open(tagged_data_addr)])
+     return seen_ids
+
+
+ def get_keyphrase_by_gpt(document) -> str:
+     global error_count
+     # prompt = 'extract main keywords from below document as sorted list (sort by importance). you should not use numbers for counting them. you should generate less than 10 keywords.'
+     # prompt = 'Output only valid JSON list. Please extract the main keywords from the following document. The keywords should be in a comma-separated list, sorted by their importance. Do not use numbers to count the keywords. Try to generate less than 10 keywords.'
+     prompt = 'there is a popular NLP task named KPE (keyphrase Extraction). please extract keyphrases from below article as a perfect Persian KPE model. '
+     role_prompt = 'return your answer using json list format'
+     message = prompt + '\n' + document
+     # message = prompt + '\n' + document
+     # message = document
+     messages = [
+         # {"role": "system", "content": "Output only valid JSON list"},
+         {"role": "system", "content": role_prompt},
+         {"role": "user", "content": message},
+     ]
+     try:
+         response = openai.ChatCompletion.create(
+             model=GPT_MODEL,
+             messages=messages,
+             temperature=0
+         )
+         response_message = response["choices"][0]["message"]["content"]
+         error_count = 0
+         return response_message
+     except Exception as e:
+         if error_count > 3:
+             raise e
+         error_count += 1
+         time.sleep(20)
+         return []
+
+
+ # input_data = [json.load(line) for line in open('all_data.json').read().splitlines())
+ # input_data = open('all_data.json')
+ input_data = pd.read_csv('truncated_wiki_plus_shuffled_41203.csv')
+ # print('len input data : ', len(input_data))
+ try:
+     last_index = int(open(LAST_INDEX_FILE_ADDR).read())
+     print('load last index: ', last_index)
+ except:
+     print('error in loading last index')
+     last_index = 0
+
+
+ try:
+     token_count = int(open(TOKEN_COUNT_FILE_ADDR).read())
+     print('load token count: ', token_count)
+ except:
+     print('error in loading token_count')
+     token_count = 0
+
+ json_f_writer = jsonlines.open(f'tagged_data.jsonl_{str(last_index)}', mode='w')
+ seen_ids = extract_seen_ids()
+ for _, row_tup in enumerate(tqdm(input_data.iterrows(), total=len(input_data))):
+     index, row = row_tup
+     text = row['truncated_text_300']
+     id = row['id']
+
+     # filter by last index
+     if index < last_index:
+         print('skipping index: ', index)
+         continue
+
+     # filter by seen ids
+     if id in seen_ids:
+         print('repeated id, skipping')
+         continue
+
+     # filter by gpt max token
+     text_gpt_token_count = num_tokens(text, model=GPT_MODEL)
+     if text_gpt_token_count > GPT_TOKEN_LIMIT:
+         continue
+
+     token_count += text_gpt_token_count
+     keyphrases = get_keyphrase_by_gpt(text)
+     try:
+         keyphrases = json.loads(keyphrases)
+         if type(keyphrases) != list:
+             # if type(keyphrases) == str:
+             #     keyphrases = keyphrases.split(',')
+             # else:
+             print(str(index), ': not a list!')
+     except:
+         print(str(index), ': invalid json!')
+
+     new_train_item = {'id': id, 'keyphrases': keyphrases}
+     json_f_writer.write(new_train_item)
+     last_index_f = open(LAST_INDEX_FILE_ADDR, 'w+')
+     last_index_f.write(str(index))
+     token_count_f = open(TOKEN_COUNT_FILE_ADDR, 'w+')
+     token_count_f.write(str(token_count))
+
+ print(token_count)
logo.png ADDED
main.py ADDED
@@ -0,0 +1,71 @@
+ import uvicorn
+ import os
+ from typing import Union
+ from fastapi import FastAPI
+ from kpe import KPE
+ from fastapi.middleware.cors import CORSMiddleware
+ # from fastapi.middleware.trustedhost import TrustedHostMiddleware
+ from fastapi import APIRouter, Query
+ from sentence_transformers import SentenceTransformer
+ import utils
+ from ranker import get_sorted_keywords
+ from pydantic import BaseModel
+
+
+ app = FastAPI(
+     title="AHD Persian KPE",
+     # version=config.settings.VERSION,
+     description="Keyphrase Extraction",
+     openapi_url="/openapi.json",
+     docs_url="/",
+ )
+
+ TRAINED_MODEL_ADDR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'trained_model', 'trained_model_10000.pt')
+ kpe = KPE(trained_kpe_model=TRAINED_MODEL_ADDR, flair_ner_model='flair/ner-english-ontonotes-large', device='cpu')
+ ranker_transformer = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2', device='cpu')
+ # Sets all CORS enabled origins
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # str(origin) for origin in config.settings.BACKEND_CORS_ORIGINS
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ class KpeParams(BaseModel):
+     text: str
+     count: int = 10000
+     using_ner: bool = True
+     return_sorted: bool = False
+
+
+ router = APIRouter()
+
+
+ @router.get("/")
+ def home():
+     return "Welcome to AHD Keyphrase Extraction Service"
+
+
+ @router.post("/extract", description="extract keyphrases from Persian documents")
+ async def extract(kpe_params: KpeParams):
+     global kpe
+     text = utils.normalize(kpe_params.text)
+     kps = kpe.extract(text, using_ner=kpe_params.using_ner)
+     if kpe_params.return_sorted:
+         kps = get_sorted_keywords(ranker_transformer, text, kps)
+     else:
+         kps = [(kp, 1) for kp in kps]
+     if len(kps) > kpe_params.count:
+         kps = kps[:kpe_params.count]
+     return kps
+
+
+ app.include_router(router)
+
+
+ if __name__ == "__main__":
+     uvicorn.run("main:app", host="0.0.0.0", port=7201)
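A hedged client-side sketch for the /extract endpoint above, assuming the service is running locally on port 7201; the requests package is an extra assumption (it is not listed in requirements.txt), and the sample text is made up:

import requests

payload = {
    "text": "متن فارسی برای استخراج عبارات کلیدی",
    "count": 10,
    "using_ner": True,
    "return_sorted": True,
}
resp = requests.post("http://localhost:7201/extract", json=payload, timeout=300)
resp.raise_for_status()
for phrase, score in resp.json():  # a list of [keyphrase, score] pairs
    print(score, phrase)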
ner_data_construction.py ADDED
@@ -0,0 +1,70 @@
+ import pandas as pd
+ import json
+ import glob
+
+
+ def tag_document(keywords, tokens):
+
+     # Initialize the tags list with all O's
+     tags = ['O'] * len(tokens)
+
+     # Loop over the keywords and tag the document
+     for keyword in keywords:
+         # Split the keyword into words
+         keyword_words = keyword.split()
+
+         # Loop over the words in the document
+         for i in range(len(tokens)):
+             # If the current word matches the first word of the keyword
+             if tokens[i] == keyword_words[0]:
+                 match = True
+                 # Check if the rest of the words in the keyword match the following words in the document
+                 for j in range(1, len(keyword_words)):
+                     if i + j >= len(tokens) or tokens[i + j] != keyword_words[j]:
+                         match = False
+                         break
+                 # If all the words in the keyword match the following words in the document, tag them as B-KEYWORD and I-KEYWORD
+                 if match:
+                     tags[i] = 'B-KEYWORD'
+                     for j in range(1, len(keyword_words)):
+                         tags[i + j] = 'I-KEYWORD'
+
+     return tags
+
+
+ def create_tner_dataset(all_tags, all_tokens, output_file_addr):
+     output_f = open(output_file_addr, 'a+')
+     for tags, tokens in zip(all_tags, all_tokens):
+         for tag, tok in zip(tags, tokens):
+             line = '\t'.join([tok, tag])
+             output_f.write(line)
+             output_f.write('\n')
+         output_f.write('\n')
+
+
+ if __name__ == '__main__':
+
+     data_df = pd.read_csv('truncated_wiki_plus_shuffled_41203.csv')
+     id2document = data_df.set_index('id')['truncated_text_300'].to_dict()
+
+     # tag documents!
+     print('------------------ tag documents --------------------')
+     all_tags = []
+     all_tokens = []
+     for tagged_data_addr in glob.iglob('./tagged_data*'):
+         for line in open(tagged_data_addr):
+             item = json.loads(line)
+             if type(item['keyphrases']) == list:
+                 keywords = item['keyphrases']
+                 document = id2document[item['id']]
+                 tokens = document.split()
+                 tags = tag_document(keywords, tokens)
+                 assert len(tokens) == len(tags)
+                 all_tags.append(tags)
+                 all_tokens.append(tokens)
+                 print(len(keywords), len(tags), len(document.split()), len([t for t in tags if t[0] == 'B']))
+     nerda_dataset = {'sentences': all_tokens, 'tags': all_tags}
+     with open('nerda_dataset.json', 'w+') as f:
+         json.dump(nerda_dataset, f)
+     # create_tner_dataset(all_tags, all_tokens, output_file_addr='./sample_train.conll')
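A toy run of tag_document showing the IOB tags it produces (the tokens and keyphrases are made up; the expected output follows directly from the matching loop above):

from ner_data_construction import tag_document

tokens = "هوش مصنوعی در ایران رشد می‌کند".split()
keywords = ["هوش مصنوعی", "ایران"]
print(tag_document(keywords, tokens))
# expected: ['B-KEYWORD', 'I-KEYWORD', 'O', 'B-KEYWORD', 'O', 'O']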
predict.py ADDED
@@ -0,0 +1,14 @@
+ import time
+ from kpe import KPE
+ import sys
+ import os
+
+
+ if __name__ == '__main__':
+     TRAINED_MODEL_ADDR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'trained_model_10000.pt')
+     text_addr = sys.argv[1]
+     text = open(text_addr).read()
+     kpe = KPE(trained_kpe_model=TRAINED_MODEL_ADDR, flair_ner_model='flair/ner-english-ontonotes-large', device='cpu')
+     s = time.time()
+     print(kpe.extract(text))
+     print(time.time() - s)
ranker.py ADDED
@@ -0,0 +1,28 @@
+ """
+ Keyphrase ranking with sentence embeddings (semantic search).
+
+ The document and every candidate keyphrase are embedded with a
+ sentence-transformer, and the keyphrases are sorted by their cosine
+ similarity to the document embedding.
+ (Adapted from the sentence-transformers semantic-search example.)
+ """
+ from sentence_transformers import util
+ import torch
+
+
+ def get_sorted_keywords(embedder, text, keywords):
+     top_k = len(keywords)
+     keywords_embedding = embedder.encode(keywords, convert_to_tensor=True)
+     text_embedding = embedder.encode(text, convert_to_tensor=True)
+
+     cos_scores = util.cos_sim(keywords_embedding, text_embedding).squeeze(dim=1)
+     # print(cos_scores.size())
+     top_results = torch.topk(cos_scores, k=top_k)
+     return [(keywords[idx], top_results[0][index].item()) for index, idx in enumerate(top_results[1])]
+     # return [keywords[idx] for idx in top_results[1]]
+     # for score, idx in zip(top_results[0], top_results[1]):
+     #     print(keywords[idx], "(Score: {:.4f})".format(score))
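A small sketch of get_sorted_keywords with the same sentence-transformer used elsewhere in this repo (model download assumed to succeed; the sample strings are made up):

from sentence_transformers import SentenceTransformer
from ranker import get_sorted_keywords

embedder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", device="cpu")
text = "متن نمونه درباره پردازش زبان طبیعی و استخراج عبارات کلیدی"
keywords = ["پردازش زبان طبیعی", "استخراج عبارات کلیدی", "ورزش"]
# returns (keyphrase, cosine similarity to the document) pairs, highest first
print(get_sorted_keywords(embedder, text, keywords))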
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ fastapi
+ uvicorn
+ flair
+ hazm
+ parsinorm
+ pydantic
+ seaborn
+ streamlit
+ altair==4.2.2
utils.py ADDED
@@ -0,0 +1,63 @@
+ from parsinorm import General_normalization
+ import re
+
+
+ def get_ne_from_iob_output(sentences, tags_conf):
+     sentences = sentences[0]
+     tags = tags_conf[0][0]
+     confs = tags_conf[1][0]
+
+     seen_b = False
+     keywords = {}
+     new_token = []
+     begin_index = 0
+     for index, (tok, tag) in enumerate(zip(sentences, tags)):
+         if tag[0] == 'I' and seen_b:
+             new_token.append(tok)
+         if tag[0] == 'B':
+             if new_token:
+                 keywords[' '.join(new_token)] = confs[begin_index]
+                 new_token = []
+             new_token.append(tok)
+             begin_index = index
+             seen_b = True
+         if tag[0] == 'O':
+             if new_token:
+                 keywords[' '.join(new_token)] = confs[begin_index]
+                 new_token = []
+             seen_b = False
+
+     # flush a keyphrase that runs to the very end of the text
+     if new_token:
+         keywords[' '.join(new_token)] = confs[begin_index]
+
+     # print('keywords before sort: ', [k for k in keywords.keys])
+     # sort by the confidence of the B- token, highest first
+     sorted_keywords = sorted(list(keywords.keys()), key=lambda kw: keywords[kw], reverse=True)
+     print('keywords after sort: ', sorted_keywords)
+     return sorted_keywords
+
+
+ def fuzzy_subword_match(key, words):
+     for index, w in enumerate(words):
+         if (len(key.split()) < len(w.split())) and key in w:
+             return index
+     return -1
+
+
+ # normalize
+ def normalize(txt):
+     general_normalization = General_normalization()
+     txt = general_normalization.alphabet_correction(txt)
+     txt = general_normalization.semi_space_correction(txt)
+     txt = general_normalization.english_correction(txt)
+     txt = general_normalization.html_correction(txt)
+     txt = general_normalization.arabic_correction(txt)
+     txt = general_normalization.punctuation_correction(txt)
+     txt = general_normalization.specials_chars(txt)
+     txt = general_normalization.remove_emojis(txt)
+     txt = general_normalization.number_correction(txt)
+     txt = general_normalization.remove_not_desired_chars(txt)
+     txt = general_normalization.remove_repeated_punctuation(txt)
+     return ' '.join(txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').split())
+
+
+ def remove_puncs(txt):
+     return re.sub(r'[!?،\(\)\.]', '', txt)
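A quick sketch exercising the helpers above (assuming parsinorm is installed, as listed in requirements.txt; the sample strings are made up):

import utils

print(utils.remove_puncs("هوش مصنوعی، چیست!"))  # -> 'هوش مصنوعی چیست'
print(utils.fuzzy_subword_match("هوش مصنوعی", ["آزمایشگاه هوش مصنوعی", "تهران"]))  # -> 0
print(utils.normalize("سلام!!!   دنیا\nخوبی؟"))  # collapsed whitespace, normalized characters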