Spaces:

cuongnguyen910
/

topic-clustering-global-dashboard

Build error

App Files Files Community

topic-clustering-global-dashboard / service_cache_v2.py

cuongnguyen910

Upload folder using huggingface_hub

5120311 verified 3 months ago

raw

history blame contribute delete

3.52 kB

	from entity import Docs, Cluster, Preprocess, SummaryInput, DocsWithSamples
	from fastapi import FastAPI
	import time
	import hashlib
	import json
	from fastapi.middleware.cors import CORSMiddleware
	# from function import topic_clustering as tc
	from function import topic_clustering_v2 as tc
	from iclibs.ic_rabbit import ICRabbitMQ
	from get_config import config_params
	from summary import postprocess_title_clusters

	# ASGI application object; the route decorators in this module register on it.
	app = FastAPI()

	# CORS policy: every origin, HTTP method and request header is accepted, and
	# credentialed requests are allowed.
	# NOTE(review): wildcard origins combined with allow_credentials=True is a very
	# permissive policy — confirm this is intended for the deployment.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
	    allow_credentials=True,
	)


	def get_hash_id(item: DocsWithSamples):
	    """Build the cache key identifying a clustering request.

	    The key is the SHA-224 hex digest of a canonical JSON document holding the
	    document URLs (in request order) together with the request options.
	    Each option is stored under its own name, so two different combinations
	    can never serialise to the same text (plain concatenation could not tell
	    ``top_cluster=1, top_sentence=12`` from ``top_cluster=11, top_sentence=2``).

	    ``max_doc_per_cluster`` is part of the key because the clustering endpoint
	    in this module forwards it to the clustering call; without it, requests
	    differing only in that option would share one cached result.

	    Keys produced by the earlier undelimited concatenation no longer match;
	    index entries stored under them are never hit again and are dropped once
	    they exceed the index's age limit.

	    Args:
	        item: Request payload. ``item.response["docs"]`` must be an iterable
	            of mappings that each provide a ``"url"`` entry.

	    Returns:
	        The 56-character hexadecimal SHA-224 digest.
	    """
	    # A blank / whitespace-only sort field is treated the same as "no sort field".
	    sorted_field = str(item.sorted_field) if item.sorted_field.strip() else ""
	    # Any falsy delete_message (False, None, "") is treated as "not set".
	    delete_message = str(item.delete_message) if item.delete_message else ""

	    key_fields = {
	        "urls": [doc["url"] for doc in item.response["docs"]],
	        "top_cluster": str(item.top_cluster),
	        "top_sentence": str(item.top_sentence),
	        "topn_summary": str(item.topn_summary),
	        "top_doc": str(item.top_doc),
	        "threshold": str(item.threshold),
	        # getattr keeps the function usable with payloads lacking this option.
	        "max_doc_per_cluster": str(getattr(item, "max_doc_per_cluster", None)),
	        "sorted_field": sorted_field,
	        "delete_message": delete_message,
	    }
	    # sort_keys makes the serialisation independent of dict construction order.
	    canonical = json.dumps(key_fields, sort_keys=True)
	    return hashlib.sha224(canonical.encode("utf-8")).hexdigest()


	# Load the persisted cache index (request hash -> {"time", "response_path"}).
	# Loading is best-effort: a missing, unreadable or malformed index must not
	# prevent the service from starting, so any failure falls back to an empty
	# index and is only reported on stdout.
	try:
	    with open("log_run/log.txt", encoding="utf-8") as f:
	        data_dict = json.load(f)
	    # Valid JSON that is not an object (e.g. ``null`` or ``[]``) cannot be
	    # used as the index: later ``data_dict[hash] = ...`` writes would fail.
	    if not isinstance(data_dict, dict):
	        raise ValueError("cache index in log_run/log.txt is not a JSON object")
	except Exception as ve:
	    print(ve)
	    data_dict = {}


	# The cache is consulted only for requests with more than this many documents.
	_CACHE_LOOKUP_MIN_DOCS = 200
	# Index entries older than this many seconds (30 days) are pruned on rewrite.
	_CACHE_TTL_SECONDS = 30 * 24 * 3600
	# Sentinel distinguishing "no usable cache entry" from a cached JSON ``null``.
	_CACHE_MISS = object()


	def _load_cached_result(hash_str):
	    """Return the stored response for ``hash_str`` or ``_CACHE_MISS``.

	    Any failure (entry without a path, missing or corrupt result file) is
	    printed and reported as a miss so the caller recomputes the response.
	    """
	    try:
	        if hash_str in data_dict:
	            path_res = data_dict[hash_str]["response_path"]
	            with open(path_res, encoding="utf-8") as ff:
	                return json.load(ff)
	    except Exception as vee:
	        print(vee)
	    return _CACHE_MISS


	def _store_cached_result(hash_str, results, created_at):
	    """Persist ``results`` and refresh the on-disk index (best effort).

	    The response has already been computed when this runs, so a failure here
	    (missing directory, unserialisable payload, malformed index entry) is
	    printed instead of being raised and failing the request.
	    """
	    path_res = "log/result_{0}.txt".format(hash_str)
	    try:
	        # Serialise before opening so a failure cannot leave an empty file.
	        serialized = json.dumps(results)
	        with open(path_res, "w+", encoding="utf-8") as ff:
	            ff.write(serialized)
	        data_dict[hash_str] = {"time": created_at, "response_path": path_res}

	        # Drop index entries past the age limit.
	        # NOTE(review): only the index entry is removed; the result file it
	        # pointed to stays on disk — confirm whether it should be deleted too.
	        now = time.time()
	        expired = [
	            key for key, entry in data_dict.items()
	            if now - entry["time"] > _CACHE_TTL_SECONDS
	        ]
	        for key in expired:
	            del data_dict[key]

	        with open("log_run/log.txt", "w+", encoding="utf-8") as ff:
	            ff.write(json.dumps(data_dict))
	    except Exception as err:
	        print(err)


	@app.post("/newsanalysis/topic_clustering")
	async def topic_clustering(item: DocsWithSamples):
	    """Cluster the documents of a request, reusing a stored response if possible.

	    For requests with more than ``_CACHE_LOOKUP_MIN_DOCS`` documents the
	    response stored under the request hash is returned when it can be read.
	    Otherwise the clustering is computed, post-processed, stored under the
	    request hash and returned.

	    Args:
	        item: Request payload; ``item.response["docs"]`` holds the documents
	            and the remaining attributes are the clustering options.

	    Returns:
	        The cached response, or the output of ``postprocess_title_clusters``.
	    """
	    docs = item.response["docs"]
	    print("start ")
	    print("len doc: ", len(docs))
	    st_time = time.time()
	    hash_str = get_hash_id(item)
	    print(hash_str)

	    if len(docs) > _CACHE_LOOKUP_MIN_DOCS:
	        cached = _load_cached_result(hash_str)
	        if cached is not _CACHE_MISS:
	            print("time analysis (cache): ", time.time() - st_time)
	            return cached

	    # delete_message is not applied during clustering itself; it is handled
	    # by the post-processing step below.
	    results = tc.topic_clustering(
	        docs,
	        item.threshold,
	        top_cluster=item.top_cluster,
	        top_sentence=item.top_sentence,
	        topn_summary=item.topn_summary,
	        sorted_field=item.sorted_field,
	        max_doc_per_cluster=item.max_doc_per_cluster,
	        delete_message=False,
	    )
	    results = postprocess_title_clusters(results, delete_message=item.delete_message)

	    # Responses are stored for every request although lookups only happen for
	    # large ones. NOTE(review): confirm this asymmetry is intended.
	    _store_cached_result(hash_str, results, st_time)

	    print("time analysis: ", time.time() - st_time)
	    return results

	def init_rabbit_queue(usr, passw, host, vir_host, queue_name, durable, max_priority, exchange=""):
	    """Create an ``ICRabbitMQ`` connection and initialise a queue on it.

	    Args:
	        usr: User name handed to ``ICRabbitMQ``.
	        passw: Password handed to ``ICRabbitMQ``.
	        host: Host handed to ``ICRabbitMQ``.
	        vir_host: Virtual host handed to ``ICRabbitMQ``.
	        queue_name: Name of the queue passed to ``init_queue``.
	        durable: Forwarded to ``init_queue`` as ``durable``.
	        max_priority: Forwarded to ``init_queue`` as ``max_priority``.
	        exchange: Forwarded to ``init_queue`` as ``exchange``; defaults to ``""``.

	    Returns:
	        A ``(channel, connection, queue_name)`` tuple, where ``channel`` is the
	        value returned by ``init_queue`` and ``connection`` is the
	        ``ICRabbitMQ`` instance.
	    """
	    rabbit = ICRabbitMQ(host, vir_host, usr, passw)
	    rabbit.init_connection()

	    queue_options = {
	        "exchange": exchange,
	        "durable": durable,
	        "max_priority": max_priority,
	    }
	    queue_channel = rabbit.init_queue(queue_name, **queue_options)
	    return queue_channel, rabbit, queue_name