Spaces:

KIMOSSINO
/

hashtags

Sleeping

App Files Files Community

KIMOSSINO committed on Dec 7, 2024

Commit

54fa405

verified ·

1 Parent(s): 3d4c5be

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -49

app.py CHANGED Viewed

@@ -1,15 +1,11 @@
-import gradio as gr
 from bs4 import BeautifulSoup
 import pandas as pd
 from collections import Counter
-# Data processing: parse an uploaded HTML file into a titles table and a hashtag-frequency table.
-def process_file(file):  # returns a 2-tuple: (titles DataFrame, hashtag-count DataFrame) or (message string, None)
-    try:
-        # Read the file contents (assumes file.read() returns bytes -- TODO confirm against the caller)
-        content = file.read().decode('utf-8')
-    except Exception as e:  # any read/decode failure is reported as a message instead of raising
-        return f"خطأ أثناء قراءة الملف: {str(e)}", None
     # Parse the HTML with BeautifulSoup (built-in 'html.parser' backend)
     soup = BeautifulSoup(content, 'html.parser')
@@ -18,62 +14,37 @@ def process_file(file):
     data = []
     hashtags_counter = Counter()
-    # Find the target containers (div elements carrying this exact CSS class)
     desc_containers = soup.find_all('div', class_="css-vi46v1-DivDesContainer")
-    if not desc_containers:
-        return "لم يتم العثور على أي بيانات مطابقة في الملف.", None
     for container in desc_containers:
         # Extract the title; a placeholder string is used when no title tag was found
-        title_tag = container.find('h1')
         title = title_tag.get_text(strip=True) if title_tag else "بدون عنوان"
         # Extract the hashtags: keep only link texts that start with '#'
         hashtags = [
             tag.get_text(strip=True)
-            for tag in container.find_all('a')
             if tag.get_text(strip=True).startswith('#')
         ]
         hashtags_counter.update(hashtags)
-        # Add one row per container; hashtags are joined into a single comma-separated string
         data.append({"Title": title, "Hashtags": ", ".join(hashtags)})
-    # Convert the results to DataFrames; hashtag counts are sorted from most to least frequent
-    df = pd.DataFrame(data)
-    hashtags_df = pd.DataFrame(hashtags_counter.items(), columns=["Hashtag", "Count"]).sort_values(by="Count", ascending=False)
-    return df, hashtags_df
-# Gradio callback: run process_file on the upload and return two HTML/text outputs.
-def gradio_interface(file):  # returns (titles output, hashtags output), each an HTML table or a message string
-    result = process_file(file)
-    if isinstance(result, tuple):
-        titles_df, hashtags_df = result
-    else:  # NOTE(review): every visible return in process_file is a tuple, so this branch looks unreachable -- confirm
-        return result, ""
-    if titles_df is None or hashtags_df is None:  # NOTE(review): process_file's error tuples (message, None) land here, so the specific message is replaced by the generic one below
-        return "لم يتم استخراج أي بيانات.", ""
-    # Convert the results to HTML for display; empty tables are replaced by a message string
-    titles_html = titles_df.to_html(index=False) if not titles_df.empty else "لا توجد عناوين مستخرجة."
-    hashtags_html = hashtags_df.to_html(index=False) if not hashtags_df.empty else "لا توجد هاشتاغات مستخرجة."
-    return titles_html, hashtags_html
-# Create the Gradio interface: one file input, two HTML outputs, wired to gradio_interface
-interface = gr.Interface(
-    fn=gradio_interface,
-    inputs=gr.File(label="ارفع ملف HTML"),
-    outputs=[
-        gr.HTML(label="العناوين والهاشتاغات المستخرجة"),
-        gr.HTML(label="الهاشتاغات مع عدد مرات تكرارها")
-    ],
-    title="استخراج العناوين والهاشتاغات",
-    description="ارفع ملف HTML لاستخراج العناوين والهاشتاغات مع عدد مرات تكرار كل هاشتاغ."
-)
-# Launch the app (module-level call, so it runs when the file is executed or imported)
-interface.launch()

 from bs4 import BeautifulSoup
 import pandas as pd
 from collections import Counter
+def extract_data_from_html(file_path):  # returns (df_titles, df_hashtags): per-container rows and hashtag counts
+    # Read the whole file as UTF-8 text; open() errors are not caught here and propagate to the caller
+    with open(file_path, 'r', encoding='utf-8') as file:
+        content = file.read()
     # Parse the HTML with BeautifulSoup (built-in 'html.parser' backend)
     soup = BeautifulSoup(content, 'html.parser')
     data = []
     hashtags_counter = Counter()
+    # Find the containers that hold the data (div elements carrying this exact CSS class)
     desc_containers = soup.find_all('div', class_="css-vi46v1-DivDesContainer")
     for container in desc_containers:
         # Extract the title; a placeholder string is used when no matching h1 is found
+        title_tag = container.find('h1', class_="css-198cw7i-H1Container")
         title = title_tag.get_text(strip=True) if title_tag else "بدون عنوان"
         # Extract the hashtags: keep only matching link texts that start with '#'
         hashtags = [
             tag.get_text(strip=True)
+            for tag in container.find_all('a', class_="css-sbcvet-StyledCommonLink")
             if tag.get_text(strip=True).startswith('#')
         ]
         hashtags_counter.update(hashtags)
+        # Append one row per container; hashtags are joined into a single comma-separated string
         data.append({"Title": title, "Hashtags": ", ".join(hashtags)})
+    # Convert the data to DataFrames; hashtag counts are sorted from most to least frequent
+    df_titles = pd.DataFrame(data)
+    df_hashtags = pd.DataFrame(hashtags_counter.items(), columns=["Hashtag", "Count"]).sort_values(by="Count", ascending=False)
+    return df_titles, df_hashtags
+# Use the function to parse the file (module-level statements: they run whenever this file is executed or imported)
+file_path = "/mnt/data/Spanish.txt"  # NOTE(review): hard-coded absolute path; a missing file raises from open() inside extract_data_from_html
+titles_df, hashtags_df = extract_data_from_html(file_path)
+# Display the results: print both DataFrames to stdout under Arabic headings
+print("العناوين والهاشتاغات المستخرجة:")
+print(titles_df)
+print("\nالهاشتاغات مع عدد مرات تكرارها:")
+print(hashtags_df)