File size: 2,603 Bytes
21b78eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from common_module import *
from string import punctuation
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Loading NLTK Modules
from nltk.tokenize import sent_tokenize

class documentEmbedding:
    
  '''

  This class can be used online (in colab) or offline (locally):
  1. Online:
  If using this class in Colab and downloading the data from external source using the code
  in the notebook only run download_dataset function above in the code cell.
  2. Offline:
  If using this class to process news article data available in a local directory,
  "data_path" parameter should be defined.
  Where "data_path" is the path to the folder containing all news articles datasets
  datasets:

  Parameters:
  -----------

  data_path: str
  the path to 'all-the-news-2-1.csv' data if the data is downloaded from GDrive or other location.

  '''

  def __init__(self, data_path):
    # self.data = df
    self.sentence_model = SentenceTransformer("all-mpnet-base-v2")
    self.umap_reducer = pickle.load(open(os.path.join(data_path, "models/umap_reducer_model.sav"), 'rb'))

  def doc_clean(self, text):
    text = text.lower()
    text = text.replace('\xa0', '')
    text = re.sub('[!"#$%&\'()’*+,-/:;<=>?β€”@[\\]^_`{|}~’]', '', text)
    return text

  def sentence_to_vector(self, sent):
      # Encode the sentence
      embeddings = self.sentence_model.encode(sent, show_progress_bar=False)

      return embeddings
    
  def doc_to_vectors(self, doc):
      doc = self.doc_clean(str(doc))
      sentences  = sent_tokenize(doc)
      # sentence to vector representation
      vector = [self.sentence_to_vector(sent) for sent in sentences]
      doc_embd = np.multiply.reduce(vector)

      return doc_embd

  def generate_embedding(self, data_df):
    print("Generating embedding vectors ...")
    data_df['article_embd'] = data_df['article'].apply(self.doc_to_vectors)
    df_vec = pd.DataFrame([list(emb) for emb in data_df['article_embd'].values])
    return df_vec

  def feature_reduction(self, embd_vector, umap_reducer=False):
    if umap_reducer:
      reducer = umap_reducer
      data_umap = reducer.transform(embd_vector)
    else:
      reducer =  UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
      reducer.fit(embd_vector)
      data_umap = reducer.transform(embd_vector)
    return pd.DataFrame(data_umap), reducer 

  def embedding_main(self, df):
    df_vec = self.generate_embedding(df)
    df_vec, _ = self.feature_reduction(df_vec,umap_reducer=self.umap_reducer)
    return df_vec