Tsunnami
/

BERTopic-Phi-3

Text Classification

BERTopic

English

Model card Files Files and versions Community

Tsunnami committed on May 7

Commit

2bceb6f

•

1 Parent(s): e9a4955

Update README.md

Browse files

Files changed (1) hide show

README.md +129 -1

README.md CHANGED Viewed

@@ -409,4 +409,132 @@ topic_model.get_topic_info()
 |367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
 |368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
-</details>

 |367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
 |368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
+</details>
+## Training Procedure
+The model was trained as follows:
+```py
+from bertopic import BERTopic
+from sentence_transformers import SentenceTransformer
+from umap import UMAP
+from hdbscan import HDBSCAN
+from sklearn.feature_extraction.text import CountVectorizer
+from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance, ZeroShotClassification
+# Sentence-embedding model used to embed the documents before clustering.
+embedding_model = SentenceTransformer("all-mpnet-base-v2")
+# Dimensionality reduction; random_state is fixed so repeated runs use the same seed.
+umap_model = UMAP(n_neighbors=25, n_components=5, min_dist=0.0, metric='cosine', random_state=42, verbose=True) # tunable: n_neighbors, n_components, metric
+# NOTE(review): hdbscan_model and vectorizer_model are built here, but the
+# matching keyword arguments are commented out in the BERTopic(...) call below,
+# so these two objects are not passed to the topic model.
+hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True) # tunable: min_cluster_size, min_samples
+vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)
+# One entry per representation; a list value chains several representation models.
+# NOTE(review): `summarization` is not defined in this snippet. It is the
+# SummarizationRepresentation instance created in the "Mounted on
+# Base-Representation" section, so that section has to be run first.
+representation_models = {
+    "POS": PartOfSpeech("en_core_web_lg"),
+    "KeyBERTInspired": KeyBERTInspired(),
+    "MMR": MaximalMarginalRelevance(diversity=0.3),
+    "KeyBERT + MMR": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)],
+    "Summarization": summarization, # custom prompted LLM that writes a topic summary
+}
+topic_model = BERTopic(
+    language="english",
+    embedding_model=embedding_model,
+    umap_model=umap_model,
+    #hdbscan_model=hdbscan_model,
+    #vectorizer_model=vectorizer_model,
+    representation_model=representation_models,
+    verbose=True,
+)
+# NOTE(review): `docs` is not defined in this snippet; presumably the list of
+# input documents. Confirm against the training data.
+topics, probs = topic_model.fit_transform(docs)
+```
+## Create Own Representation Model
+This uses [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) because it is lightweight.
+### Defined Summarization
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+# Fix torch's random seed before loading and generating.
+torch.random.manual_seed(42)
+# Load the Phi-3 checkpoint used to write topic summaries.
+summarization_model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Phi-3-mini-128k-instruct",
+    device_map="cuda",  # presumably requires a CUDA-capable GPU; confirm for your setup
+    torch_dtype="auto",
+    # NOTE(review): allows code shipped in the model repository to run locally;
+    # only enable for repositories you trust.
+    trust_remote_code=True,
+)
+# Tokenizer loaded from the same checkpoint name as the model above.
+summarization_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
+def summarize_with_model(text):
+    question = f"""
+        I have a document of which abstract and title are given.
+        The following documents are a small but representative subset of all documents in the topic:
+        {text}
+        Based on the information above, please give a description topic in the following keyword format:
+        topic: <description>
+    """
+    messages = [
+        {"role": "user", "content": question},
+    ]
+    pipe = pipeline(
+        "text-generation",
+        model=summarization_model,
+        tokenizer=summarization_tokenizer,
+    )
+    generation_args = {
+        "max_new_tokens": 128,
+        "return_full_text": False,
+        "temperature": 0.0,
+        "do_sample": False,
+    }
+    output = pipe(messages, **generation_args)
+    return output[0]['generated_text']
+```
+Prompt used:
+```py
+# Same wording as the prompt built inside summarize_with_model (only the
+# indentation differs). `text` is that function's argument and is not
+# defined in this excerpt.
+question = f"""
+    I have a document of which abstract and title are given.
+    The following documents are a small but representative subset of all documents in the topic:
+    {text}
+    Based on the information above, please give a description topic in the following keyword format:
+    topic: <description>
+"""
+```
+**NOTE: Experimenting with other, better prompts is recommended.**
+### Mounted on Base-Representation
+```py
+from bertopic.representation._base import BaseRepresentation
+from typing import List, Mapping, Tuple
+class SummarizationRepresentation(BaseRepresentation):
+    def __init__(self, summarization_model, summarization_tokenizer):
+        self.summarization_model = summarization_model
+        self.summarization_tokenizer = summarization_tokenizer
+    def extract_topics(self, topic_model, documents, c_tf_idf, topics
+                      ) -> Mapping[str, List[Tuple[str, float]]]:
+        updated_topics = {}
+        for topic_id, words in topics.items():
+            # Extract only the words from the tuples
+            words_only = [word[0] for word in words]
+            text = " ".join(words_only)
+            summary = summarize_with_model(text)
+            updated_topics[topic_id] = [(summary, 1.0)]
+        return updated_topics
+# Instance referenced under the "Summarization" key of `representation_models`
+# in the Training Procedure snippet.
+summarization = SummarizationRepresentation(summarization_model, summarization_tokenizer)
+```