Update README.md
Browse files
README.md
CHANGED
@@ -409,4 +409,132 @@ topic_model.get_topic_info()
|
|
409 |
|367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
|
410 |
|368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
|
411 |
|
412 |
-
</details>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
409 |
|367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
|
410 |
|368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
|
411 |
|
412 |
+
</details>
|
413 |
+
|
414 |
+
## Training Procedure
|
415 |
+
|
416 |
+
The model was trained as follows:
|
417 |
+
|
418 |
+
```py
|
419 |
+
from bertopic import BERTopic
|
420 |
+
|
421 |
+
from sentence_transformers import SentenceTransformer
|
422 |
+
|
423 |
+
from umap import UMAP
|
424 |
+
from hdbscan import HDBSCAN
|
425 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
426 |
+
from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance, ZeroShotClassification
|
427 |
+
|
428 |
+
embedding_model = SentenceTransformer("all-mpnet-base-v2")
|
429 |
+
umap_model = UMAP(n_neighbors=25, n_components=5, min_dist=0.0, metric='cosine', random_state=42, verbose=True) # tunable: n_neighbors, n_components, metric
|
430 |
+
# NOTE: hdbscan_model and vectorizer_model are built here, but the BERTopic(...) call below has both arguments commented out, so they are not passed to the topic model.
hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True) # tunable: min_cluster_size, min_samples
|
431 |
+
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)
|
432 |
+
|
433 |
+
|
434 |
+
# Representation models keyed by name; the whole dict is passed to BERTopic as representation_model below.
representation_models = {
|
435 |
+
"POS": PartOfSpeech("en_core_web_lg"),
|
436 |
+
"KeyBERTInspired": KeyBERTInspired(),
|
437 |
+
"MMR": MaximalMarginalRelevance(diversity=0.3),
|
438 |
+
"KeyBERT + MMR": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)],
|
439 |
+
"Summarization": summarization, # custom LLM-prompted representation: the SummarizationRepresentation instance created in the "Mounted on Base-Representation" section
|
440 |
+
}
|
441 |
+
|
442 |
+
# Build the topic model from the components defined above.
topic_model = BERTopic(
|
443 |
+
language="english",
|
444 |
+
embedding_model=embedding_model,
|
445 |
+
umap_model=umap_model,
|
446 |
+
# The next two arguments are disabled, so the custom HDBSCAN and CountVectorizer objects are not used here.
#hdbscan_model=hdbscan_model,
|
447 |
+
#vectorizer_model=vectorizer_model,
|
448 |
+
representation_model=representation_models,
|
449 |
+
verbose=True,
|
450 |
+
)
|
451 |
+
# `docs` is not defined in this snippet; presumably the list of input document strings (title + abstract) - TODO confirm.
topics, probs = topic_model.fit_transform(docs)
|
452 |
+
```
|
453 |
+
|
454 |
+
## Create Own Representation Model
|
455 |
+
|
456 |
+
This uses [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) because it is lightweight.
|
457 |
+
|
458 |
+
### Defined Summarization
|
459 |
+
|
460 |
+
```py
|
461 |
+
import torch
|
462 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
463 |
+
|
464 |
+
# Seed PyTorch's random number generator with a fixed value.
torch.random.manual_seed(42)
|
465 |
+
|
466 |
+
# Load the Phi-3 mini instruct checkpoint as a causal LM. device_map="cuda" assumes a CUDA GPU is available;
# trust_remote_code=True allows code shipped with the model repository to run when loading.
summarization_model = AutoModelForCausalLM.from_pretrained(
|
467 |
+
"microsoft/Phi-3-mini-128k-instruct",
|
468 |
+
device_map="cuda",
|
469 |
+
torch_dtype="auto",
|
470 |
+
trust_remote_code=True,
|
471 |
+
)
|
472 |
+
# Tokenizer loaded from the same checkpoint name as the model.
summarization_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
|
473 |
+
|
474 |
+
_SUMMARIZATION_PIPELINE = None


def _get_summarization_pipeline():
    """Return the shared text-generation pipeline, building it on first use."""
    global _SUMMARIZATION_PIPELINE
    if _SUMMARIZATION_PIPELINE is None:
        # Built once and reused: constructing a pipeline for every topic is
        # pure overhead because the model and tokenizer never change.
        _SUMMARIZATION_PIPELINE = pipeline(
            "text-generation",
            model=summarization_model,
            tokenizer=summarization_tokenizer,
        )
    return _SUMMARIZATION_PIPELINE


def summarize_with_model(text):
    """Ask the summarization model for a short topic description of ``text``.

    Args:
        text: Text describing the topic; it is inserted verbatim into the
            prompt template below.

    Returns:
        The ``generated_text`` of the first pipeline result. Because
        ``return_full_text`` is False, the prompt is not included.
    """
    question = f"""
I have a document of which abstract and title are given.
The following documents are a small but representative subset of all documents in the topic:
{text}

Based on the information above, please give a description topic in the following keyword format:
topic: <description>
"""
    messages = [
        {"role": "user", "content": question},
    ]
    generation_args = {
        "max_new_tokens": 128,
        "return_full_text": False,
        # Greedy decoding. `temperature` is only used when sampling, so it is
        # not passed; passing it with do_sample=False only triggers a warning.
        "do_sample": False,
    }
    output = _get_summarization_pipeline()(messages, **generation_args)
    return output[0]["generated_text"]
|
499 |
+
|
500 |
+
```
|
501 |
+
|
502 |
+
Prompt used:
|
503 |
+
|
504 |
+
```py
|
505 |
+
# Prompt template sent as the single user message; {text} is filled from the `text` argument of summarize_with_model.
question = f"""
|
506 |
+
I have a document of which abstract and title are given.
|
507 |
+
The following documents are a small but representative subset of all documents in the topic:
|
508 |
+
{text}
|
509 |
+
|
510 |
+
Based on the information above, please give a description topic in the following keyword format:
|
511 |
+
topic: <description>
|
512 |
+
"""
|
513 |
+
```
|
514 |
+
|
515 |
+
**NOTE: Experimenting with other, better prompts is recommended.**
|
516 |
+
|
517 |
+
### Mounted on Base-Representation
|
518 |
+
|
519 |
+
```py
|
520 |
+
from bertopic.representation._base import BaseRepresentation
|
521 |
+
from typing import List, Mapping, Tuple
|
522 |
+
|
523 |
+
class SummarizationRepresentation(BaseRepresentation):
    """Topic representation that labels every topic with a generated summary.

    For each topic, the topic's words are joined into one string and handed
    to ``summarize_with_model``; the returned text becomes the topic's only
    entry, with a score of 1.0.
    """

    def __init__(self, summarization_model, summarization_tokenizer):
        # Stored on the instance, but extract_topics does not read them: it
        # calls the module-level summarize_with_model helper instead.
        self.summarization_tokenizer = summarization_tokenizer
        self.summarization_model = summarization_model

    def extract_topics(
        self, topic_model, documents, c_tf_idf, topics
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Replace each topic's word list with a single generated summary.

        Args:
            topic_model: Unused here.
            documents: Unused here.
            c_tf_idf: Unused here.
            topics: Mapping of topic id to a sequence of entries whose first
                element is the word.

        Returns:
            A dict with the same keys as ``topics``; each value is a
            one-element list ``[(summary, 1.0)]``.
        """
        return {
            topic_id: [
                (summarize_with_model(" ".join(entry[0] for entry in entries)), 1.0)
            ]
            for topic_id, entries in topics.items()
        }
|
538 |
+
|
539 |
+
summarization = SummarizationRepresentation(summarization_model, summarization_tokenizer)
|
540 |
+
```
|