Spaces:

mfoud444
/

oop

Build error

App Files Files Community

Mohammed Foud committed 26 days ago

Commit

dc961fb

1 Parent(s): b3ed9e6

first commit

Browse files

Files changed (3) hide show

Dockerfile +5 -0
app.py +120 -0
requirements.txt +2 -1

Dockerfile CHANGED Viewed

@@ -22,6 +22,11 @@ RUN pip install --no-cache-dir -r requirements.txt && \
     python -m textblob.download_corpora && \
     python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"
 # Copy the rest of the application
 COPY . .

     python -m textblob.download_corpora && \
     python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"
+# Install the compiler toolchain (build-essential) for sentence-transformers.
+# --no-install-recommends keeps optional packages out of the image.
+# NOTE(review): this layer is created after the `pip install -r requirements.txt`
+# step above, so the toolchain is not available while pip installs packages;
+# move it before that step if any dependency has to compile from source.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
 # Copy the rest of the application
 COPY . .

app.py CHANGED Viewed

@@ -13,6 +13,9 @@ from collections import defaultdict
 from tabulate import tabulate
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 # Load models and initialize components
 model_path = "./final_model"
@@ -313,6 +316,123 @@ def add_clusters_to_df(df):
     return df
 # Create and launch the interface
 if __name__ == "__main__":
     demo = create_interface()

 from tabulate import tabulate
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
+from sentence_transformers import SentenceTransformer
+from sklearn.decomposition import PCA
+from collections import Counter
 # Load models and initialize components
 model_path = "./final_model"
     return df
+def generate_category_summaries(df):
+    """Generate product summaries in table format"""
+    # First, ensure we have clusters
+    if 'cluster_name' not in df.columns:
+        df = create_clusters(df)
+    summaries = {}
+    for cluster_name in df['cluster_name'].unique():
+        cluster_df = df[df['cluster_name'] == cluster_name]
+        # Get top products by rating
+        top_products = cluster_df.groupby('name').agg({
+            'reviews.rating': ['mean', 'count'],
+            'reviews.text': list
+        }).reset_index()
+        top_products.columns = ['name', 'avg_rating', 'review_count', 'reviews']
+        top_products = top_products[top_products['review_count'] >= 5]  # Min reviews threshold
+        top_products = top_products.sort_values('avg_rating', ascending=False)
+        if len(top_products) < 3:
+            continue
+        # Get top 3 and worst products
+        top_3 = top_products.head(3)
+        worst_product = top_products.tail(1)
+        # Analyze reviews for each product
+        product_details = []
+        for _, product in top_3.iterrows():
+            pros, cons = analyze_sentiment(product['reviews'])
+            product_details.append({
+                'name': product['name'],
+                'rating': product['avg_rating'],
+                'review_count': product['review_count'],
+                'pros': pros[:3] or ["No significant positive feedback"],
+                'cons': cons[:3] or ["No major complaints"]
+            })
+        # Format tables
+        tables = []
+        # Top Products Table
+        top_table = []
+        for product in product_details:
+            top_table.append([
+                product['name'],
+                f"★{product['rating']:.1f}",
+                product['review_count'],
+                "\n".join(product['pros']),
+                "\n".join(product['cons'])
+            ])
+        tables.append({
+            'section': f"TOP PRODUCTS IN {cluster_name.upper()}",
+            'headers': ["Product", "Rating", "Reviews", "Pros", "Cons"],
+            'data': top_table
+        })
+        # Worst Product Table
+        if not worst_product.empty:
+            worst = worst_product.iloc[0]
+            pros, cons = analyze_sentiment(worst['reviews'])
+            tables.append({
+                'section': "PRODUCT TO AVOID",
+                'headers': ["Product", "Rating", "Reasons to Avoid"],
+                'data': [[
+                    worst['name'],
+                    f"★{worst['avg_rating']:.1f}",
+                    ", ".join(cons[:3]) if cons else "Consistently poor ratings"
+                ]]
+            })
+        summaries[cluster_name] = tables
+    return summaries
+def create_clusters(df):
+    """Create clusters from product data"""
+    # Prepare product data
+    products = df[['name', 'categories']].drop_duplicates()
+    product_texts = (products['name'] + " " + products['categories']).tolist()
+    # Create embeddings
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    embeddings = model.encode(product_texts, show_progress_bar=True)
+    # Perform clustering
+    num_clusters = 4
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+    clusters = kmeans.fit_predict(embeddings)
+    products['cluster'] = clusters
+    # Generate cluster names
+    cluster_names = {}
+    for cluster_num in range(num_clusters):
+        cluster_df = products[products['cluster'] == cluster_num]
+        # Get descriptive words from product names
+        words = []
+        for name in cluster_df['name']:
+            words += name.lower().split()
+        # Get top words for cluster name
+        top_words = [word for word, count in Counter(words).most_common(10)
+                    if len(word) > 3][:3]
+        label = ' '.join(top_words)
+        cluster_names[cluster_num] = label
+    # Map clusters to original dataframe
+    product_to_cluster = dict(zip(products['name'], products['cluster']))
+    df['cluster'] = df['name'].map(product_to_cluster)
+    df['cluster_name'] = df['cluster'].map(cluster_names)
+    return df
 # Create and launch the interface
 if __name__ == "__main__":
     demo = create_interface()

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ transformers>=4.30.0
 scikit-learn>=1.2.0
 textblob>=0.17.1
 tabulate>=0.9.0
-nltk>=3.8.1

 scikit-learn>=1.2.0
 textblob>=0.17.1
 tabulate>=0.9.0
+nltk>=3.8.1
+sentence-transformers>=2.2.0