Mohammed Foud committed on
Commit
dc961fb
·
1 Parent(s): b3ed9e6

first commit

Browse files
Files changed (3) hide show
  1. Dockerfile +5 -0
  2. app.py +120 -0
  3. requirements.txt +2 -1
Dockerfile CHANGED
@@ -22,6 +22,11 @@ RUN pip install --no-cache-dir -r requirements.txt && \
22
  python -m textblob.download_corpora && \
23
  python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"
24
 
 
 
 
 
 
25
  # Copy the rest of the application
26
  COPY . .
27
 
 
22
  python -m textblob.download_corpora && \
23
  python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"
24
 
25
+ # Install additional system dependencies for sentence-transformers
26
+ RUN apt-get update && apt-get install -y \
27
+ build-essential \
28
+ && rm -rf /var/lib/apt/lists/*
29
+
30
  # Copy the rest of the application
31
  COPY . .
32
 
app.py CHANGED
@@ -13,6 +13,9 @@ from collections import defaultdict
13
  from tabulate import tabulate
14
  from sklearn.feature_extraction.text import TfidfVectorizer
15
  from sklearn.cluster import KMeans
 
 
 
16
 
17
  # Load models and initialize components
18
  model_path = "./final_model"
@@ -313,6 +316,123 @@ def add_clusters_to_df(df):
313
 
314
  return df
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  # Create and launch the interface
317
  if __name__ == "__main__":
318
  demo = create_interface()
 
13
  from tabulate import tabulate
14
  from sklearn.feature_extraction.text import TfidfVectorizer
15
  from sklearn.cluster import KMeans
16
+ from sentence_transformers import SentenceTransformer
17
+ from sklearn.decomposition import PCA
18
+ from collections import Counter
19
 
20
  # Load models and initialize components
21
  model_path = "./final_model"
 
316
 
317
  return df
318
 
319
+ def generate_category_summaries(df):
320
+ """Generate product summaries in table format"""
321
+ # First, ensure we have clusters
322
+ if 'cluster_name' not in df.columns:
323
+ df = create_clusters(df)
324
+
325
+ summaries = {}
326
+
327
+ for cluster_name in df['cluster_name'].unique():
328
+ cluster_df = df[df['cluster_name'] == cluster_name]
329
+
330
+ # Get top products by rating
331
+ top_products = cluster_df.groupby('name').agg({
332
+ 'reviews.rating': ['mean', 'count'],
333
+ 'reviews.text': list
334
+ }).reset_index()
335
+
336
+ top_products.columns = ['name', 'avg_rating', 'review_count', 'reviews']
337
+ top_products = top_products[top_products['review_count'] >= 5] # Min reviews threshold
338
+ top_products = top_products.sort_values('avg_rating', ascending=False)
339
+
340
+ if len(top_products) < 3:
341
+ continue
342
+
343
+ # Get top 3 and worst products
344
+ top_3 = top_products.head(3)
345
+ worst_product = top_products.tail(1)
346
+
347
+ # Analyze reviews for each product
348
+ product_details = []
349
+ for _, product in top_3.iterrows():
350
+ pros, cons = analyze_sentiment(product['reviews'])
351
+ product_details.append({
352
+ 'name': product['name'],
353
+ 'rating': product['avg_rating'],
354
+ 'review_count': product['review_count'],
355
+ 'pros': pros[:3] or ["No significant positive feedback"],
356
+ 'cons': cons[:3] or ["No major complaints"]
357
+ })
358
+
359
+ # Format tables
360
+ tables = []
361
+
362
+ # Top Products Table
363
+ top_table = []
364
+ for product in product_details:
365
+ top_table.append([
366
+ product['name'],
367
+ f"★{product['rating']:.1f}",
368
+ product['review_count'],
369
+ "\n".join(product['pros']),
370
+ "\n".join(product['cons'])
371
+ ])
372
+
373
+ tables.append({
374
+ 'section': f"TOP PRODUCTS IN {cluster_name.upper()}",
375
+ 'headers': ["Product", "Rating", "Reviews", "Pros", "Cons"],
376
+ 'data': top_table
377
+ })
378
+
379
+ # Worst Product Table
380
+ if not worst_product.empty:
381
+ worst = worst_product.iloc[0]
382
+ pros, cons = analyze_sentiment(worst['reviews'])
383
+ tables.append({
384
+ 'section': "PRODUCT TO AVOID",
385
+ 'headers': ["Product", "Rating", "Reasons to Avoid"],
386
+ 'data': [[
387
+ worst['name'],
388
+ f"★{worst['avg_rating']:.1f}",
389
+ ", ".join(cons[:3]) if cons else "Consistently poor ratings"
390
+ ]]
391
+ })
392
+
393
+ summaries[cluster_name] = tables
394
+
395
+ return summaries
396
+
397
+ def create_clusters(df):
398
+ """Create clusters from product data"""
399
+ # Prepare product data
400
+ products = df[['name', 'categories']].drop_duplicates()
401
+ product_texts = (products['name'] + " " + products['categories']).tolist()
402
+
403
+ # Create embeddings
404
+ model = SentenceTransformer('all-MiniLM-L6-v2')
405
+ embeddings = model.encode(product_texts, show_progress_bar=True)
406
+
407
+ # Perform clustering
408
+ num_clusters = 4
409
+ kmeans = KMeans(n_clusters=num_clusters, random_state=42)
410
+ clusters = kmeans.fit_predict(embeddings)
411
+ products['cluster'] = clusters
412
+
413
+ # Generate cluster names
414
+ cluster_names = {}
415
+ for cluster_num in range(num_clusters):
416
+ cluster_df = products[products['cluster'] == cluster_num]
417
+
418
+ # Get descriptive words from product names
419
+ words = []
420
+ for name in cluster_df['name']:
421
+ words += name.lower().split()
422
+
423
+ # Get top words for cluster name
424
+ top_words = [word for word, count in Counter(words).most_common(10)
425
+ if len(word) > 3][:3]
426
+ label = ' '.join(top_words)
427
+ cluster_names[cluster_num] = label
428
+
429
+ # Map clusters to original dataframe
430
+ product_to_cluster = dict(zip(products['name'], products['cluster']))
431
+ df['cluster'] = df['name'].map(product_to_cluster)
432
+ df['cluster_name'] = df['cluster'].map(cluster_names)
433
+
434
+ return df
435
+
436
  # Create and launch the interface
437
  if __name__ == "__main__":
438
  demo = create_interface()
requirements.txt CHANGED
@@ -8,4 +8,5 @@ transformers>=4.30.0
8
  scikit-learn>=1.2.0
9
  textblob>=0.17.1
10
  tabulate>=0.9.0
11
- nltk>=3.8.1
 
 
8
  scikit-learn>=1.2.0
9
  textblob>=0.17.1
10
  tabulate>=0.9.0
11
+ nltk>=3.8.1
12
+ sentence-transformers>=2.2.0