Browse files
@@ -1,147 +1,102 @@
1 |
2 |
from minivectordb.embedding_model import EmbeddingModel
3 |
4 |
5 |
6 |
7 |
import concurrent.futures
8 |
9 |
10 |
11 |
langdetect_model = fasttext.load_model('lid.176.ftz')
12 |
embedding_model = EmbeddingModel(onnx_model_cpu_core_count=
13 |
14 |
15 |
tokenizer = tiktoken.encoding_for_model("gpt-4")
16 |
17 |
def count_tokens_tiktoken(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)
19 |
20 |
21 |
detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
semantic_db = VectorDatabase()
52 |
ids = [i for i in range(len(non_stopword_words))]
53 |
metadata_dicts = [{"w": word} for word in non_stopword_words]
54 |
semantic_db.store_embeddings_batch(ids, non_stopword_embeddings, metadata_dicts)
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
high_priority_count = max(high_priority_count, 0) # Ensure it's not negative
68 |
high_priority_indices = ordered_indices[:high_priority_count]
69 |
70 |
71 |
72 |
73 |
remaining_remove = num_remove
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
if remaining_remove > 0:
84 |
lower_priority_indices = ordered_indices[high_priority_count:]
85 |
num_non_stop = min(remaining_remove, len(lower_priority_indices)) # Ensure we don't sample more than available
86 |
prioritized_non_stop_indices = random.sample(lower_priority_indices, num_non_stop) if num_non_stop > 0 else []
87 |
88 |
89 |
90 |
stop_comb = random.sample(stopword_indices, num_stop) if num_stop > 0 else []
91 |
combination = set(stop_comb + prioritized_non_stop_indices)
92 |
93 |
new_string = [word for i, word in enumerate(words) if i not in combination or i in high_priority_indices]
94 |
combinations.append(' '.join(new_string))
95 |
96 |
return list(set(combinations))
97 |
98 |
99 |
def extract_embeddings(text):
    """Return the embedding of ``text`` as computed by the shared ``embedding_model``."""
    text_embedding = embedding_model.extract_embeddings(text)
    return text_embedding
101 |
102 |
def extract_embeddings_batch(texts):
    """Return a list with one embedding per entry of ``texts``, in input order."""
    batch_embeddings = []
    for single_text in texts:
        batch_embeddings.append(extract_embeddings(single_text))
    return batch_embeddings
104 |
105 |
106 |
107 |
108 |
word_count = len(input_text.split())
109 |
110 |
thresholds = [(1500, 80), (1000, 90), (700, 110), (500, 130), (250, 160)]
111 |
for threshold, value in thresholds:
112 |
if word_count > threshold:
113 |
num_samples = value
114 |
115 |
116 |
semantic_embeddings = extract_embeddings(input_text)
117 |
text_lang = detect_language_en_pt(input_text)
118 |
stopwords = en_stop_words if text_lang == 'en' else pt_stop_words
119 |
text_combinations = generate_combinations(input_text, word_reduction_factor, stopwords, semantic_embeddings, num_samples=num_samples)
120 |
121 |
n = int(num_samples / cpu_count())
122 |
# Aggregate text_combinations into blocks of "n"
123 |
text_combinations_chunks = [text_combinations[i:i + n] for i in range(0, len(text_combinations), n)]
124 |
125 |
# Calculate the embeddings for each combination
126 |
combinations_embeddings = []
127 |
with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
128 |
for embeddings in, text_combinations_chunks):
129 |
130 |
131 |
semantic_db = VectorDatabase()
132 |
unique_ids = [ i for i in range(len(text_combinations)) ]
133 |
metadata_dicts = [ {"text": text} for text in text_combinations ]
134 |
semantic_db.store_embeddings_batch(unique_ids, combinations_embeddings, metadata_dicts)
135 |
136 |
_, _, result = semantic_db.find_most_similar(semantic_embeddings, k=1)
137 |
best_compressed_sentence = result[0]['text']
138 |
return best_compressed_sentence
139 |
140 |
async def predict(text, word_reduction_factor):
141 |
if len(text.split()) > 700:
142 |
return "Text is too long for this demo. Please provide a text with less than 700 words."
143 |
144 |
compressed =
145 |
perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / count_tokens_tiktoken(text)) * 100, 2)
146 |
147 |
return f"{compressed}\n\nToken Reduction: {perc_reduction}%"
@@ -162,7 +117,7 @@ reduction_factor = gr.Slider(
162 |
163 |
164 |
165 |
166 |
167 |
# Create the gradio interface
168 |
1 |
from sklearn.feature_extraction.text import CountVectorizer
2 |
from sklearn.decomposition import LatentDirichletAllocation
3 |
from minivectordb.embedding_model import EmbeddingModel
4 |
from sklearn.metrics.pairwise import cosine_similarity
5 |
import tiktoken, nltk, numpy as np, fasttext, pickle
6 |
from nltk.tokenize import sent_tokenize
7 |
import gradio as gr
8 |
9 |
10 |
11 |
12 |
# Shared, module-level resources used by the functions below.
langdetect_model = fasttext.load_model('lid.176.ftz')
embedding_model = EmbeddingModel(onnx_model_cpu_core_count=2)

# Load the pickled stopword collections through context managers so the file
# handles are closed deterministically instead of being left to the GC.
with open("en_stopwords.pkl", "rb") as stopwords_file:
    english_stopwords = pickle.load(stopwords_file)
with open("pt_stopwords.pkl", "rb") as stopwords_file:
    portuguese_stopwords = pickle.load(stopwords_file)

# Tokenizer used only for reporting token counts (see count_tokens_tiktoken).
tokenizer = tiktoken.encoding_for_model("gpt-4")
17 |
18 |
def count_tokens_tiktoken(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)
20 |
21 |
def detect_language(text):
    """Classify ``text`` as Portuguese (``'pt'``) or, for anything else, English (``'en'``).

    Newlines are replaced with spaces before the text is handed to
    ``langdetect_model``; only the top-ranked label (``k=1``) is inspected.
    """
    single_line = text.replace('\n', ' ')
    prediction = langdetect_model.predict(single_line, k=1)
    top_label = str(prediction[0][0])
    if top_label in ('__label__pt', 'portuguese'):
        return 'pt'
    return 'en'
24 |
25 |
def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
26 |
def calculate_similarity(embed1, embed2):
    """Return the cosine similarity between two embedding vectors."""
    # cosine_similarity works on batches, so wrap each vector as a one-row
    # batch and read the single entry of the resulting matrix.
    pairwise = cosine_similarity([embed1], [embed2])
    return pairwise[0][0]
28 |
29 |
def create_lda_model(texts, stopwords):
30 |
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
31 |
doc_term_matrix = vectorizer.fit_transform(texts)
32 |
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
33 |
34 |
return lda, vectorizer
35 |
36 |
def get_topic_distribution(text, lda, vectorizer):
    """Return the first row of ``lda.transform`` applied to the vectorized ``text``."""
    # The vectorizer expects a collection of documents, hence the one-item list.
    term_counts = vectorizer.transform([text])
    topic_rows = lda.transform(term_counts)
    return topic_rows[0]
39 |
40 |
def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
    """Score ``sentence`` as a weighted blend of three signals.

    The score is ``0.4 * semantic_similarity + 0.4 * topic_importance +
    0.2 * lexical_diversity`` where:

    * semantic_similarity: similarity between ``doc_embedding`` and the
      sentence's own embedding;
    * topic_importance: the largest value in the sentence's topic distribution;
    * lexical_diversity: distinct lower-cased non-stopword words divided by the
      total whitespace-separated word count (0 for a sentence with no words).
    """
    # Similarity of the sentence to the document as a whole.
    semantic_similarity = calculate_similarity(
        doc_embedding, embedding_model.extract_embeddings(sentence)
    )

    # Weight of the sentence's dominant topic.
    topic_importance = np.max(get_topic_distribution(sentence, lda_model, vectorizer))

    # Lexical diversity over whitespace-separated tokens.
    tokens = sentence.split()
    if tokens:
        distinct = {tok.lower() for tok in tokens if tok.lower() not in stopwords}
        lexical_diversity = len(distinct) / len(tokens)
    else:
        lexical_diversity = 0

    # Combine factors (weights can be adjusted as needed).
    return (0.4 * semantic_similarity) + (0.4 * topic_importance) + (0.2 * lexical_diversity)
55 |
56 |
# Split the text into sentences
57 |
sentences = sent_tokenize(full_text)
58 |
59 |
text_lang = detect_language(full_text)
60 |
61 |
# Create LDA model
62 |
lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
63 |
64 |
# Get document-level embedding
65 |
doc_embedding = embedding_model.extract_embeddings(full_text)
66 |
67 |
# Calculate importance for each sentence
68 |
sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
69 |
for sentence in sentences]
70 |
71 |
# Sort sentences by importance
72 |
sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
73 |
74 |
# Determine how many words to keep
75 |
total_words = sum(len(sentence.split()) for sentence in sentences)
76 |
target_words = int(total_words * compression_rate)
77 |
78 |
# Reconstruct the compressed text
79 |
compressed_text = []
80 |
current_words = 0
81 |
for sentence, _ in sorted_sentences:
82 |
sentence_words = len(sentence.split())
83 |
if current_words + sentence_words <= target_words:
84 |
85 |
current_words += sentence_words
86 |
87 |
88 |
89 |
# Reorder sentences to maintain original flow
90 |
compressed_text.sort(key=lambda x: sentences.index(x))
91 |
92 |
return ' '.join(compressed_text)
93 |
94 |
95 |
async def predict(text, word_reduction_factor):
    """Compress ``text`` and report the token reduction that was achieved.

    Args:
        text: The input text to compress.
        word_reduction_factor: Fraction of the words to remove. Its complement
            (``1 - word_reduction_factor``) is forwarded to
            ``semantic_compress_text`` as ``compression_rate``, i.e. the
            fraction of words to keep.

    Returns:
        The compressed text followed by a ``Token Reduction: N%`` line, or a
        fixed notice when ``text`` has more than 700 whitespace-separated words.
    """
    if len(text.split()) > 700:
        return "Text is too long for this demo. Please provide a text with less than 700 words."

    # semantic_compress_text has no ``word_reduction_factor`` parameter; the
    # keep-ratio must be passed as ``compression_rate`` or the call raises
    # TypeError.
    compressed = semantic_compress_text(text, compression_rate=1 - word_reduction_factor)
    perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / count_tokens_tiktoken(text)) * 100, 2)

    return f"{compressed}\n\nToken Reduction: {perc_reduction}%"
117 |
118 |
119 |
120 |
label="Reduction Factor"
121 |
122 |
# Create the gradio interface
123 |