# sheetbot / app.py
import os
import logging

import gradio as gr
import pandas as pd
from dotenv import load_dotenv
import jieba
from wordcloud import WordCloud

from sheet import (
    compose_query,
    get_serp,
    get_condensed_result,
    extract_results,
    postprocess_result,
    format_output,
    category2supercategory,
)

# Warm up jieba once at import time so the dictionary is loaded before the first request.
jieba.cut('你好')
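
# Rough pipeline, inferred from how the `sheet` helpers are used below (the exact
# behaviour lives in sheet.py):
#   compose_query(address, business_name)  -> Google query string
#   get_serp(query, google_domain, gl, lr) -> raw search-result page (SERP)
#   get_condensed_result(serp)             -> condensed "evidence" text
#   extract_results(df)                    -> category / store name / phone / description
#   postprocess_result(df, ...)            -> cleanup + category-to-supercategory mapping
#   format_output(df)                      -> display-ready dataframe for the UI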
load_dotenv()

logging.basicConfig(level=logging.DEBUG)  # attach a handler so DEBUG messages are actually emitted
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def plot_wordcloud(text):
    """Segment `text` with jieba and render it as a word-cloud image.

    Uses FONT_PATH from the environment (if set) so CJK characters render correctly.
    Returns a PIL image suitable for a gr.Image output.
    """
    font_path = os.getenv("FONT_PATH")
    if font_path:
        wc_generator = WordCloud(font_path=font_path)
    else:
        wc_generator = WordCloud()
    wordcloud = wc_generator.generate(" ".join(jieba.cut(text)))
    return wordcloud.to_image()
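
# Illustrative usage (assumes FONT_PATH points to a CJK-capable font so the Chinese
# tokens are not rendered as empty boxes):
#   img = plot_wordcloud("好吃的牛肉麵 老字號 排隊名店")
#   img.save("/tmp/wordcloud_preview.png")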


def format_category(formatted_results):
    """Format the first row of the post-processed results as a Markdown summary.

    Labels in order: supercategory (大類別), category (小類別),
    store name (商家名稱), phone number (電話), description (描述).
    """
    return "\n\n".join([
        f"> 大類別:{formatted_results['supercategory'].values[0]}",
        f"> 小類別:{formatted_results['category'].values[0]}",
        f"> 商家名稱:{formatted_results['store_name'].values[0]}",
        f"> 電話:{formatted_results['phone_number'].values[0]}",
        f"> 描述:{formatted_results['description'].values[0]}"
    ])
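
# Illustrative output (placeholder values, not real data):
#   "> 大類別:餐飲\n\n> 小類別:麵食\n\n> 商家名稱:某某牛肉麵\n\n> 電話:02-1234-5678\n\n> 描述:..."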


def do(business_id, business_name, address):
    """Crawl Google for one business and return (evidence markdown, word-cloud image, category markdown)."""
    google_domain = "google.com.tw"
    gl = 'tw'
    lr = 'lang_zh-TW'

    query = compose_query(address, business_name)
    try:
        res = get_serp(query, google_domain, gl, lr)
    except Exception as e:
        # Return one value per output component so Gradio does not receive a single string.
        return f"Error: {e}", None, None

    cond_res = get_condensed_result(res)
    crawled_results = pd.DataFrame([{
        "index": 0,
        "business_id": business_id,
        "business_name": business_name,
        "serp": res,
        "evidence": cond_res,
        "address": address,
    }])

    extracted_results = extract_results(crawled_results)
    extracted_results = extracted_results['extracted_results'][[
        'business_id', 'business_name', 'address', 'category',
        'evidence', 'phone_number', 'description', 'store_name',
    ]]

    postprocessed_results = postprocess_result(
        extracted_results,
        postprocessed_results_path="/tmp/postprocessed_results.joblib",
        category_hierarchy=category2supercategory,
    )
    # The joblib file is only an intermediate artifact; drop it if it was written.
    if os.path.exists("/tmp/postprocessed_results.joblib"):
        os.remove("/tmp/postprocessed_results.joblib")

    formatted_results = format_output(postprocessed_results)
    formatted_output = format_category(formatted_results)
    img = plot_wordcloud(formatted_results['formatted_evidence'].values[0])

    return formatted_results['formatted_evidence'].values[0], img, formatted_output
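
# Local smoke test outside the Gradio UI (placeholder values; the call still needs
# whatever API credentials sheet.get_serp reads from .env):
#   evidence_md, wordcloud_img, category_md = do("12345678", "某某牛肉麵", "台北市某區某路1號")
#   print(category_md)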


## --- interface (alternative, kept for reference) --- ##
# A plain gr.Interface with a single Dataframe output instead of the Blocks layout below:
# outputs = [gr.Dataframe(row_count=(1, "dynamic"), col_count=(6, "dynamic"), label="output data", interactive=1)]
# demo = gr.Interface(
#     fn=do,
#     inputs=["text", "text", "text"],
#     outputs=outputs,
# )

## --- block --- ##
with gr.Blocks() as demo:
    gr.Markdown("🌟 自動分類餐廳型態 🌟")  # "Automatically classify restaurant type"
    with gr.Row():
        inputs = [
            gr.Textbox(label="統一編號", placeholder="統一編號"),  # unified business number
            gr.Textbox(label="商家名稱", placeholder="商家名稱"),  # business name
            gr.Textbox(label="地址", placeholder="地址"),          # address
        ]
    with gr.Row():
        outputs = [
            gr.Markdown(label="參考資料(google search)"),  # evidence from Google search
            gr.Image(label="文字雲"),                      # word cloud
            gr.Markdown(label="類別"),                     # category summary
        ]
    btn = gr.Button("Submit")
    btn.click(fn=do, inputs=inputs, outputs=outputs)


if __name__ == "__main__":
    # Note: share=True exposes a public link, and the basic-auth credentials are hardcoded.
    demo.launch(share=True, auth=("kota", "kota"))