kensvin committed on
Commit
286516b
·
verified ·
1 Parent(s): cd4d249

Update gradio text

Browse files
Files changed (1) hide show
  1. app.py +124 -125
app.py CHANGED
@@ -1,125 +1,124 @@
1
- !pip install transformers sentence_transformers langchain openai python-dotenv chromadb faiss-gpu
2
- import re
3
- from urllib.parse import urlparse, parse_qs
4
- import pandas as pd
5
- import unicodedata as uni
6
- import emoji
7
- from langchain.chat_models import ChatOpenAI
8
- from langchain.embeddings import HuggingFaceEmbeddings
9
- from langchain.document_loaders import DataFrameLoader
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- from langchain.vectorstores import FAISS
12
- from langchain.chains import RetrievalQA
13
- from tokopedia import request_product_id, request_product_review
14
- import gradio as gr
15
-
16
- shop_id = ""
17
- item_id = ""
18
- item = {}
19
- LIMIT = 1000 # Limit to 1000 reviews so that processing does not take too long
20
-
21
- def scrape(URL, max_reviews=LIMIT):
22
- parsed_url = urlparse(URL)
23
- *_, SHOP, PRODUCT_KEY = parsed_url.path.split("/")
24
- product_id = request_product_id(SHOP, PRODUCT_KEY).json()["data"]["pdpGetLayout"][
25
- "basicInfo"
26
- ]["id"]
27
- all_reviews = []
28
- page = 1
29
- has_next = True
30
-
31
- while has_next and len(all_reviews) <= max_reviews:
32
- response = request_product_review(product_id, page=page)
33
- data = response.json()["data"]["productrevGetProductReviewList"]
34
- reviews = data["list"]
35
- all_reviews.extend(reviews)
36
- has_next = data["hasNext"]
37
- page += 1
38
-
39
- reviews_df = pd.json_normalize(all_reviews)
40
- return reviews_df
41
-
42
- # Clean
43
- def clean(df):
44
- df = df.dropna().copy().reset_index(drop=True) # drop reviews with empty comments
45
- df = df[df["comment"] != ""].reset_index(drop=True) # remove empty reviews
46
- df["comment"] = df["comment"].apply(lambda x: clean_text(x)) # clean text
47
- df = df[df["comment"] != ""].reset_index(drop=True) # remove empty reviews
48
- return df
49
-
50
-
51
- def clean_text(text):
52
- text = uni.normalize("NFKD", text) # normalise characters
53
- text = emoji.replace_emoji(text, "") # remove emoji
54
- text = re.sub(r"(\w)\1{2,}", r"\1", text) # repeated chars
55
- text = re.sub(r"[ ]+", " ", text).strip() # remove extra spaces
56
- return text
57
-
58
-
59
- # LLM
60
- OpenAIModel = "gpt-3.5-turbo"
61
- llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)
62
-
63
- # Embeddings
64
- embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")
65
-
66
- cache_URL = ""
67
- db = None
68
- qa = None
69
-
70
-
71
- def generate(URL, query):
72
- global cache_URL, db, qa
73
- if URL != cache_URL:
74
- # Get reviews
75
- try:
76
- reviews = scrape(URL)
77
- # Clean reviews
78
- cleaned_reviews = clean(reviews)
79
- # Load data
80
- loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
81
- documents = loader.load()
82
- except Exception as e:
83
- return "Error getting reviews: " + str(e)
84
-
85
- # Split text
86
- text_splitter = RecursiveCharacterTextSplitter(
87
- chunk_size=1000, chunk_overlap=50
88
- )
89
- docs = text_splitter.split_documents(documents)
90
- cache_URL = URL
91
- # Vector store
92
- db = FAISS.from_documents(docs, embeddings)
93
- # Chain to answer questions
94
- qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())
95
- return qa.run(query)
96
-
97
-
98
- # Gradio
99
- product_box = gr.Textbox(
100
- label="URL Produk", placeholder="URL produk dari Shopee Indonesia"
101
- )
102
- query_box = gr.Textbox(
103
- lines=2,
104
- label="Kueri",
105
- placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
106
- )
107
-
108
- gr.Interface(
109
- fn=generate,
110
- inputs=[product_box, query_box],
111
- outputs=gr.Textbox(label="Jawaban"),
112
- title="RingkasUlas",
113
- description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Shopee Indonesia (https://shopee.co.id/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Shopee dan menyiapkan jawabannya.",
114
- allow_flagging="never",
115
- examples=[
116
- [
117
- "https://shopee.co.id/Bantal-Selimut-Balmut-Mini-Karakter-kain-CVC-i.2392232.8965506?xptdk=324a77c0-7860-4059-b00d-5d3b340f8dfe",
118
- "Apa yang orang katakan tentang kualitas produknya?",
119
- ],
120
- [
121
- "https://shopee.co.id/Bantal-Selimut-Balmut-Mini-Karakter-kain-CVC-i.2392232.8965506?xptdk=324a77c0-7860-4059-b00d-5d3b340f8dfe",
122
- "Bagaimana pendapat orang yang kurang puas dengan produknya?",
123
- ],
124
- ],
125
- ).launch()
 
1
+ import re
2
+ from urllib.parse import urlparse, parse_qs
3
+ import pandas as pd
4
+ import unicodedata as uni
5
+ import emoji
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
+ from langchain.document_loaders import DataFrameLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.vectorstores import FAISS
11
+ from langchain.chains import RetrievalQA
12
+ from tokopedia import request_product_id, request_product_review
13
+ import gradio as gr
14
+
15
# NOTE(review): shop_id, item_id and item are not read anywhere in the
# visible code -- confirm against other modules before removing them.
shop_id = item_id = ""
item = dict()
# Upper bound on how many reviews are fetched per product, so that
# processing does not take too long.
LIMIT = 1_000
19
+
20
def scrape(URL, max_reviews=LIMIT):
    """Download at most ``max_reviews`` reviews for a Tokopedia product.

    Args:
        URL: Product page URL whose path ends in ``/<shop>/<product-key>``.
            A trailing slash is tolerated.
        max_reviews: Maximum number of review records to return.

    Returns:
        A ``pandas.DataFrame`` produced by ``pd.json_normalize`` from the
        collected review records (empty when no review was collected).

    Raises:
        ValueError: If the URL path does not contain both a shop and a
            product key.
    """
    # Drop empty segments so a trailing slash does not shift the positions
    # of the shop and product key.
    segments = [part for part in urlparse(URL).path.split("/") if part]
    if len(segments) < 2:
        raise ValueError(f"Cannot find shop and product key in URL: {URL!r}")
    shop, product_key = segments[-2:]

    product_id = request_product_id(shop, product_key).json()["data"][
        "pdpGetLayout"
    ]["basicInfo"]["id"]

    all_reviews = []
    page = 1
    has_next = True
    # Strict comparison: once the limit is reached no further page is fetched.
    while has_next and len(all_reviews) < max_reviews:
        response = request_product_review(product_id, page=page)
        data = response.json()["data"]["productrevGetProductReviewList"]
        reviews = data["list"]
        if not reviews:
            # An empty page adds nothing; stop instead of requesting pages
            # forever if "hasNext" keeps being reported as true.
            break
        all_reviews.extend(reviews)
        has_next = data["hasNext"]
        page += 1

    # The last fetched page may overshoot the limit, so trim before returning.
    return pd.json_normalize(all_reviews[:max_reviews])
40
+
41
+ # Clean
42
def clean(df):
    """Drop reviews without usable comment text and normalise the rest.

    Args:
        df: DataFrame of reviews with a ``comment`` column holding strings.

    Returns:
        A new DataFrame, re-indexed from 0, that keeps only the rows whose
        comment is still non-empty after ``clean_text``. All columns of the
        input are preserved.

    Raises:
        KeyError: If ``df`` has no ``comment`` column.
    """
    # Only a missing *comment* disqualifies a review.  A bare ``dropna()``
    # would also discard reviews that merely have some other field unset.
    df = df.dropna(subset=["comment"])
    df = df[df["comment"] != ""].reset_index(drop=True)
    if df.empty:
        # Nothing left to normalise.
        return df

    df["comment"] = df["comment"].apply(clean_text)
    # Cleaning can leave a comment empty, so filter once more.
    return df[df["comment"] != ""].reset_index(drop=True)
48
+
49
+
50
# Three or more consecutive repeats of the same word character ("bagusss").
# Digits are excluded so numbers such as "1000" are not rewritten as "10".
_REPEATED_CHAR_RE = re.compile(r"([^\W\d])\1{2,}")
# Runs of plain spaces (other whitespace is left untouched).
_SPACE_RUN_RE = re.compile(r"[ ]+")


def clean_text(text):
    """Normalise one review comment.

    Steps, in order: NFKD Unicode normalisation, emoji removal, collapsing a
    non-digit word character repeated three or more times into a single
    occurrence, collapsing runs of spaces, and stripping leading/trailing
    whitespace.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string; it may be empty.
    """
    text = uni.normalize("NFKD", text)  # normalise characters
    text = emoji.replace_emoji(text, "")  # remove emoji
    text = _REPEATED_CHAR_RE.sub(r"\1", text)  # "bagusss" -> "bagus"
    return _SPACE_RUN_RE.sub(" ", text).strip()  # remove extra spaces
56
+
57
+
58
# LLM
# Chat model handed to RetrievalQA in generate().
OpenAIModel = "gpt-3.5-turbo"
llm = ChatOpenAI(model=OpenAIModel, temperature=0.1)

# Embeddings
# Embedding model used by generate() when building the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="Blaxzter/LaBSE-sentence-embeddings")

# State shared between calls to generate():
#   cache_URL - URL compared against each incoming request
#   db        - FAISS vector store built from that product's reviews
#   qa        - RetrievalQA chain that answers queries over ``db``
cache_URL = ""
db = None
qa = None
68
+
69
+
70
def generate(URL, query):
    """Answer ``query`` using the reviews of the product at ``URL``.

    The vector store and QA chain are rebuilt only when ``URL`` differs from
    the one they were last built for (or when no chain exists yet); otherwise
    the cached chain is reused.

    Args:
        URL: Product page URL passed to ``scrape``.
        query: The question to ask about the reviews.

    Returns:
        The chain's answer, or a string starting with
        ``"Error getting reviews: "`` when the reviews could not be fetched,
        cleaned or loaded.
    """
    global cache_URL, db, qa
    # Also rebuild when no chain exists yet: an empty URL equals the initial
    # cache_URL and would otherwise reach ``qa.run`` with ``qa`` still None.
    if qa is None or URL != cache_URL:
        try:
            # Get reviews
            reviews = scrape(URL)
            # Clean reviews
            cleaned_reviews = clean(reviews)
            # Load data
            loader = DataFrameLoader(cleaned_reviews, page_content_column="comment")
            documents = loader.load()
            if not documents:
                # Report this here rather than letting the index build fail
                # on an empty document list.
                raise ValueError("no reviews with text were found")
        except Exception as e:
            return "Error getting reviews: " + str(e)

        # Split text
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=50
        )
        docs = text_splitter.split_documents(documents)
        # Vector store
        new_db = FAISS.from_documents(docs, embeddings)
        # Chain to answer questions
        new_qa = RetrievalQA.from_chain_type(llm=llm, retriever=new_db.as_retriever())
        # Publish the cache only after every step succeeded, so a failed
        # build never leaves cache_URL pointing at a stale or missing chain.
        cache_URL, db, qa = URL, new_db, new_qa
    return qa.run(query)
95
+
96
+
97
# Gradio
# Two text inputs (product URL and question) wired to generate(), with one
# text output for the answer.  Both examples use the same product page.
_EXAMPLE_URL = "https://www.tokopedia.com/benitashop/telur-asin-powder-madam-kwan-golden-salted-egg-powder"
_EXAMPLE_QUERIES = (
    "Berapa lama produknya bisa bertahan?",
    "Produknya bisa dipakai untuk memasak apa?",
)

product_box = gr.Textbox(
    placeholder="URL produk dari Tokopedia",
    label="URL Produk",
)
query_box = gr.Textbox(
    label="Kueri",
    placeholder="Contoh: Apa yang orang katakan tentang kualitas produknya?, Bagaimana pendapat orang yang kurang puas dengan produknya?",
    lines=2,
)

_interface = gr.Interface(
    fn=generate,
    inputs=[product_box, query_box],
    outputs=gr.Textbox(label="Jawaban"),
    title="RingkasUlas",
    description="Bot percakapan yang bisa meringkas ulasan-ulasan produk di Tokopedia Indonesia (https://tokopedia.com/). Harap bersabar, bot ini dapat memakan waktu agak lama saat mengambil ulasan dari Tokopedia dan menyiapkan jawabannya.",
    allow_flagging="never",
    examples=[[_EXAMPLE_URL, question] for question in _EXAMPLE_QUERIES],
)
_interface.launch()