isayahc committed
Commit
848d541
1 Parent(s): 51a28d2

created a function for generating keywords for bodies of text

config.py CHANGED
@@ -9,6 +9,7 @@ SQLITE_FILE_NAME = os.getenv('SOURCES_CACHE')
  PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
  SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
+ BERT_MODEL = os.getenv("BERT_MODEL")
 
 
  db = DataBaseHandler()
example.env CHANGED
@@ -27,3 +27,4 @@ LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
  LLM_MODEL_ARGS=
 
  SEVEN_B_LLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
+ BERT_MODEL="paraphrase-multilingual-MiniLM-L12-v2"
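
Taken together, the config.py and example.env changes expose a configurable BERT_MODEL name. A minimal sketch (not part of this commit; the import paths are assumed) of passing that setting into the keyword utility added below:

from config import BERT_MODEL  # "paraphrase-multilingual-MiniLM-L12-v2" per example.env
from rag_app.utils.generate_keywords_bert import extract_keywords_from_doc

# Pass the configured sentence-transformer name instead of relying on the
# function's hard-coded default.
keywords = extract_keywords_from_doc(
    "Supervised learning infers a function from labeled training data.",
    model_name=BERT_MODEL,
    top_n=5,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
)
print(keywords)  # list of (keyword, score) tuples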
rag_app/utils/generate_keywords_bert.py ADDED
@@ -0,0 +1,82 @@
+ from typing import List, Tuple, Any
+ from keybert import KeyBERT
+
+ def extract_keywords_from_doc(
+     doc: str,
+     model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
+     **kwargs: Any
+ ) -> List[Tuple[str, float]]:
+     """
+     ## Summary
+     Extract keywords from a document using the KeyBERT model.
+
+     ## Parameters:
+     doc (str): The document from which to extract keywords.
+     model_name (str): The name of the model to use. Default is "paraphrase-multilingual-MiniLM-L12-v2".
+     **kwargs (Any): Additional keyword arguments for the extract_keywords method.
+         Possible keyword arguments include:
+         - top_n (int): The number of top keywords to return.
+         - keyphrase_ngram_range (Tuple[int, int]): The ngram range for the keyphrases.
+         - stop_words (str): The stop words to use.
+         - use_maxsum (bool): Whether to use Max Sum Similarity.
+         - use_mmr (bool): Whether to use Maximal Marginal Relevance.
+         - diversity (float): The diversity parameter for MMR.
+         - nr_candidates (int): The number of candidates for Max Sum Similarity.
+
+     ## Returns:
+     List[Tuple[str, float]]: A list of tuples containing keywords and their corresponding scores.
+
+     ## Example:
+     doc = \"\"\"
+     Supervised learning is the machine learning task of learning a function that
+     maps an input to an output based on example input-output pairs. It infers a
+     function from labeled training data consisting of a set of training examples.
+     In supervised learning, each example is a pair consisting of an input object
+     (typically a vector) and a desired output value (also called the supervisory signal).
+     A supervised learning algorithm analyzes the training data and produces an inferred function,
+     which can be used for mapping new examples. An optimal scenario will allow for the
+     algorithm to correctly determine the class labels for unseen instances. This requires
+     the learning algorithm to generalize from the training data to unseen situations in a
+     'reasonable' way (see inductive bias).
+     \"\"\"
+
+     keywords = extract_keywords_from_doc(
+         doc,
+         top_n=10,
+         keyphrase_ngram_range=(1, 2),
+         stop_words='english',
+         use_maxsum=True,
+         nr_candidates=20
+     )
+     print(keywords)
+     """
+     kw_model = KeyBERT(model=model_name)
+     keywords = kw_model.extract_keywords(doc, **kwargs)
+     return keywords
+
+ if __name__ == "__main__":
+
+     # Example usage
+     doc = """
+     Supervised learning is the machine learning task of learning a function that
+     maps an input to an output based on example input-output pairs. It infers a
+     function from labeled training data consisting of a set of training examples.
+     In supervised learning, each example is a pair consisting of an input object
+     (typically a vector) and a desired output value (also called the supervisory signal).
+     A supervised learning algorithm analyzes the training data and produces an inferred function,
+     which can be used for mapping new examples. An optimal scenario will allow for the
+     algorithm to correctly determine the class labels for unseen instances. This requires
+     the learning algorithm to generalize from the training data to unseen situations in a
+     'reasonable' way (see inductive bias).
+     """
+
+     # Example of passing additional keyword arguments
+     keywords = extract_keywords_from_doc(
+         doc,
+         top_n=10,
+         keyphrase_ngram_range=(1, 2),
+         stop_words='english',
+         use_maxsum=True,
+         nr_candidates=20
+     )
+     print(keywords)
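
For comparison, the docstring also documents use_mmr and diversity; a small illustrative sketch (parameter values assumed, not from this commit) using Maximal Marginal Relevance instead of Max Sum Similarity:

from rag_app.utils.generate_keywords_bert import extract_keywords_from_doc

doc = "Supervised learning infers a function from labeled training data."

# MMR re-ranks candidate phrases to reduce redundancy; a higher `diversity`
# (between 0 and 1) returns less similar keyphrases.
keywords_mmr = extract_keywords_from_doc(
    doc,
    top_n=5,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    use_mmr=True,
    diversity=0.7,
)
print(keywords_mmr)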