File size: 4,977 Bytes
7189553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00399e0
7189553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f90fe6
7189553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from tabula import read_pdf
from bs4 import BeautifulSoup
import requests

from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

import PIL
import numpy as np
import datamapplot
import re

def get_links():
    """Extract bookmark URLs from the tables of the AwesomeList PDF.

    The first table was parsed without a usable header, so its URL column
    is named 'Unnamed: 2'; every subsequent table has a proper 'Url' column.

    Returns:
        list[str]: all URLs found, in table order.
    """
    # Reads every table from the PDF file (one DataFrame per table).
    dfs = read_pdf("Artificial_Intelligence_Bookmarks_AwesomeList.pdf", pages="all")
    links = dfs[0]['Unnamed: 2'].to_list()
    # Iterate the remaining tables directly instead of index arithmetic.
    for df in dfs[1:]:
        links.extend(df['Url'].to_list())
    return links

#--------------------------------------
# text processing

def remove_tags(html):
    """Return the visible text of *html* with all markup stripped.

    <style> and <script> elements are removed entirely (tag and contents),
    then the remaining text fragments are joined with single spaces.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop non-content elements completely before extracting text.
    non_content = soup(['style', 'script'])
    for element in non_content:
        element.decompose()

    fragments = soup.stripped_strings
    return ' '.join(fragments)

# Compiled once at import time so repeated calls skip re.compile()
# (and the re module's cache lookup) entirely.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # chinese char
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # dingbats
    u"\u3030"
                  "]+", re.UNICODE)

def remove_emoji(data):
    """Return *data* with emoji/pictographic character runs removed."""
    return _EMOJI_PATTERN.sub('', data)

#-------------------------------------

def get_page(link):
    """Fetch *link* and return up to 1050 chars of its cleaned visible text.

    Returns None (after printing the URL) when the HTTP request fails, so
    callers can filter out dead links.
    """
    try:
        # Timeout so a single dead host can't hang the whole crawl.
        response = requests.get(link, timeout=30)
        raw_html = response.text
    except requests.RequestException:
        # Narrow except: only network/HTTP failures are expected here;
        # the bare except it replaces also swallowed KeyboardInterrupt.
        print(link)
        return None
    clean_text = remove_tags(raw_html)[:1050]
    clean_text = remove_emoji(clean_text)
    return clean_text

def get_documents(links):
    """Download every link and keep only substantial documents.

    Discards failed fetches (None from get_page) and pages whose cleaned
    text is shorter than 1000 characters (likely stubs or error pages).

    Returns:
        list[str]: the cleaned page texts that passed both filters.
    """
    # Single O(n) pass replaces the quadratic `while None in ...: remove` loop.
    pages = (get_page(link) for link in links)
    return [text for text in pages if text is not None and len(text) > 999]

#----------------------------------------

def get_topics(docs):
    """Fit a BERTopic model over *docs*, with topics labelled by a local LLM.

    Uses bge-small embeddings, UMAP reduction, HDBSCAN clustering, and a
    quantized Mistral model (via llama.cpp) to generate short topic labels.

    Returns:
        BERTopic: the fitted topic model.
    """
    # Use llama.cpp to load in a Quantized LLM
    llm = Llama(model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])

    prompt = """ Q:
    I have a topic that contains the following documents:
    [DOCUMENTS]
    
    The topic is described by the following keywords: '[KEYWORDS]'.
    
    Based on the above information, can you give a short label of the topic of at most 5 words?
    A:
    """

    # BUG FIX: was `"LLM": Llam` (undefined name → NameError). The imported
    # LlamaCPP representation wrapping the llm + prompt was clearly intended.
    representation_model = {
        "KeyBERT": KeyBERTInspired(),
        "LLM": LlamaCPP(llm, prompt=prompt),
    }

    # Pre-calculate embeddings
    embedding_model = SentenceTransformer("BAAI/bge-small-en")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)

    # Pre-reduce embeddings for visualization purposes
    reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

    # Define sub-models
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    # min_cluster_size=2 keeps very small topics; dataset is small.
    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    topic_model = BERTopic(

      # Sub-models
      embedding_model=embedding_model,
      umap_model=umap_model,
      hdbscan_model=hdbscan_model,
      representation_model=representation_model,

      # Hyperparameters
      top_n_words=10,
      verbose=True
    )

    # Train model
    topics, probs = topic_model.fit_transform(docs, embeddings)

    return topic_model

#-------------------------------
# Visualize Topics
def get_figure(topic_model, topics=None, reduced_embeddings=None):
    """Build a datamapplot figure of the documents, labelled by LLM topics.

    BUG FIX: the original body referenced `topics` and `reduced_embeddings`,
    which are locals of get_topics and were undefined here (NameError).
    They are now parameters.

    Args:
        topic_model: fitted BERTopic model (as returned by get_topics).
        topics: per-document topic ids; defaults to topic_model.topics_.
        reduced_embeddings: 2-D embeddings for plotting (e.g. the UMAP
            n_components=2 projection computed in get_topics). Required.

    Returns:
        the datamapplot figure.

    Raises:
        ValueError: if reduced_embeddings is not supplied.
    """
    if topics is None:
        # BERTopic stores the per-document assignments after fit_transform.
        topics = topic_model.topics_
    if reduced_embeddings is None:
        raise ValueError("reduced_embeddings (2-D projection) is required")

    # Prepare logo
    bertopic_logo_response = requests.get(
        "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png",
        stream=True,
        headers={'User-Agent': 'My User Agent 1.0'}
    )
    bertopic_logo = np.asarray(PIL.Image.open(bertopic_logo_response.raw))

    # Create a label for each document: first line of the LLM's answer,
    # stripped of quotes and non-word characters.
    llm_labels = [re.sub(r'\W+', ' ', label[0][0].split("\n")[0].replace('"', '')) for label in topic_model.get_topics(full=True)["LLM"].values()]
    llm_labels = [label if label else "Unlabelled" for label in llm_labels]
    # Outlier topic (-1) gets "Unlabelled"; others are offset by the
    # model's outlier count to index into llm_labels.
    all_labels = [llm_labels[topic + topic_model._outliers] if topic != -1 else "Unlabelled" for topic in topics]

    # Run the visualization
    fig = datamapplot.create_plot(
        reduced_embeddings,
        all_labels,
        label_font_size=11,
        title="ArXiv - BERTopic",
        sub_title="Topics labeled with `openhermes-2.5-mistral-7b`",
        label_wrap_width=20,
        use_medoids=True,
        logo=bertopic_logo,
        logo_width=0.16
    )

    return fig