asoria HF staff committed on
Commit
10cefed
·
verified ·
1 Parent(s): 36d36ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -19,6 +19,7 @@ logging.basicConfig(
19
 
20
 
21
  session = requests.Session()
 
22
 
23
 
24
  def get_parquet_urls(dataset, config, split):
@@ -41,7 +42,7 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
41
 
42
 
43
  @spaces.GPU
44
- def calculate_embeddings(sentence_model, docs):
45
  embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
46
  logging.info(f"Embeddings shape: {embeddings.shape}")
47
  return embeddings
@@ -91,11 +92,10 @@ def generate_topics(dataset, config, split, column, nested_column):
91
  # Create instances of GPU-accelerated UMAP and HDBSCAN
92
  # umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
93
  # hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)
94
- sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
95
  while True:
96
  docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
97
  logging.info(f"------------> New chunk data {offset=} {chunk_size=}")
98
- embeddings = calculate_embeddings(sentence_model, docs)
99
  offset = offset + chunk_size
100
  if not docs or offset >= limit:
101
  break
 
19
 
20
 
21
  session = requests.Session()
22
+ sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
23
 
24
 
25
  def get_parquet_urls(dataset, config, split):
 
42
 
43
 
44
  @spaces.GPU
45
+ def calculate_embeddings(docs):
46
  embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
47
  logging.info(f"Embeddings shape: {embeddings.shape}")
48
  return embeddings
 
92
  # Create instances of GPU-accelerated UMAP and HDBSCAN
93
  # umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
94
  # hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)
 
95
  while True:
96
  docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
97
  logging.info(f"------------> New chunk data {offset=} {chunk_size=}")
98
+ embeddings = calculate_embeddings(docs)
99
  offset = offset + chunk_size
100
  if not docs or offset >= limit:
101
  break