import gradio
import openai
import os
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
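# Illustrative note (shapes assume the all-MiniLM-L6-v2 checkpoint used below):
# for a batch of 2 sentences padded to 8 tokens, model_output[0] has shape
# (2, 8, 384) and attention_mask has shape (2, 8); mean_pooling zeroes out the
# padding positions and returns a (2, 384) tensor of per-sentence embeddings.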
def BETA_run_topic_classification(list_of_sentences: list, n_neighbors=10, n_components=3, min_cluster_size=5):
    try:
        # Load model from HuggingFace Hub
        tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        # Tokenize sentences
        encoded_input = tokenizer(list_of_sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Perform pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # Convert to numpy array
        sentence_embeddings = sentence_embeddings.cpu().numpy()
        # Debug print for embeddings
        print(f"Embeddings shape: {sentence_embeddings.shape}")
        print(f"Embeddings: {sentence_embeddings}")
        umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42)
        vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
        hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
        keybert_model = KeyBERTInspired()
        mmr_model = MaximalMarginalRelevance(diversity=0.3)
        prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
        client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)
        representation_model = {"KeyBERT": keybert_model, "OpenAI": openai_model, "MMR": mmr_model}
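        # Each entry computes a separate "aspect" per topic: KeyBERT-style keywords,
        # an MMR-diversified keyword set, and a GPT-generated label (used below to
        # build human-readable custom topic labels).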
        # BERTopic expects a supported embedding backend (e.g. a SentenceTransformer),
        # not a raw transformers AutoModel; wrap the same checkpoint so representation
        # models such as KeyBERTInspired can embed candidate words and documents.
        st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        topic_model = BERTopic(
            embedding_model=st_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            representation_model=representation_model,
            low_memory=True,
            top_n_words=10,
            verbose=True
        )
        # Fit once on the precomputed embeddings
        topics, probs = topic_model.fit_transform(list_of_sentences, sentence_embeddings)
        # Build custom labels from the GPT aspect; set_topic_labels expects integer topic ids
        chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
        chatgpt_topic_labels[-1] = "Outlier Topic"
        topic_model.set_topic_labels(chatgpt_topic_labels)
        # Token-level topic distributions (computed but currently unused by the interface)
        topic_distr, _ = topic_model.approximate_distribution(list_of_sentences, window=8, stride=4)
        # Debug print for topics and probs
        print(f"Topics: {topics}")
        print(f"Probs: {probs}")
        return topics
    except Exception as e:
        print(f"An error occurred: {e}")
        return f"An error occurred: {e}"
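# Example usage (illustrative only; the sentences are made up, and with the
# default min_cluster_size=5 a real call needs a reasonably large batch):
#   topics = BETA_run_topic_classification(
#       ["I love pizza", "Pasta is my favourite", "The stock market fell today"])
#   # -> one topic id per sentence, with -1 marking HDBSCAN outliers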
def my_inference_function(sentences):
    if not isinstance(sentences, str):
        return "Input should be a string of sentences separated by commas."
    sentences_list = [sentence.strip() for sentence in sentences.split(',')]
    sentences_list = [sentence for sentence in sentences_list if sentence]
    if not sentences_list:
        return "No valid sentences provided."
    topics = BETA_run_topic_classification(sentences_list)
    return str(topics)  # One topic id per input sentence, or an error message
gradio_interface = gradio.Interface(
    fn=my_inference_function,
    inputs="text",
    outputs="text"
)
gradio_interface.launch()
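# Note: launch() serves the interface locally; Gradio also supports
# launch(share=True) to create a temporary public link for quick testing.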