Spaces:

monsimas
/

categorizer-les-propals

Sleeping

App Files Files Community

categorizer-les-propals / app.py

monsimas

change temp, top p and max tokens

9d0a6de about 2 months ago

raw history blame contribute delete

No virus

17.2 kB

	import pandas as pd
	import gradio as gr
	import io
	import base64
	import os
	import time
	import logging
	from mistralai.client import MistralClient
	from mistralai.models.chat_completion import ChatMessage
	from fuzzywuzzy import fuzz
	import re
	import json
	import requests
	import pandas as pd
	from urllib.parse import urlparse

	# Initialize logging
	# Root logger emits everything from DEBUG upwards, prefixed with timestamp and level.
	logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

	# Set up Mistral AI Client
	# The API key is read from the environment when the module is loaded;
	# os.environ[...] raises KeyError if MISTRAL_API_KEY is not set.
	client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
	# Model identifier handed to make_request() for every categorization call.
	model = "open-mixtral-8x7b"

	# Prompt templates, keyed first by UI language and then by message role.
	# Each template is filled with str.format() in categorize():
	#   - the two *_system_message entries receive the comma-joined category list,
	#   - 'user_message' receives the proposal text.
	# The doubled braces ({{ and }}) are str.format escapes: they become the literal
	# braces of the JSON examples shown to the model.
	messages = {
	'English': {
	'multiple_categories_system_message': "Categorize strictly in the relevant categories among: {}. Avoid justifications; avoid far-fetched connections and favor direct links; exclude irrelevant categories. If none fit, reply with '{{\"category\": \"None fit\"}}'. Example response: '{{\"category\": [\"ecology\", \"education\"]}}'.",
	'single_category_system_message': "Categorize strictly in the most relevant category among: {}. Avoid justifications; avoid far-fetched connections and favor direct links; exclude irrelevant categories. If none fit, reply with '{{\"category\": \"None fit\"}}'. Example response: '{{\"category\": \"education\"}}'.",
	'user_message': "Please categorize this proposal strictly in one or more of the relevant categories provided. Proposal: \"{}\". "
	},
	'Spanish': {
	'multiple_categories_system_message': "Categorizar estrictamente en las categorías relevantes entre: {}. Evite justificaciones; evite conexiones forzadas y favorezca vínculos directos; excluya categorías irrelevantes. Si ninguna es adecuada, responda con '{{\"category\": \"Ninguna corresponde\"}}'. Ejemplo de respuesta: '{{\"category\": [\"ecología\", \"educación\"]}}'.",
	'single_category_system_message': "Categorizar estrictamente en la categoría más relevante entre: {}. Evite justificaciones; evite conexiones forzadas y favorezca vínculos directos; excluya categorías irrelevantes. Si ninguna es adecuada, responda con '{{\"category\": \"Ninguna corresponde\"}}'. Ejemplo de respuesta: '{{\"category\": \"educación\"}}'.",
	'user_message': "Por favor, categorice esta propuesta estrictamente en una o más de las categorías relevantes proporcionadas. Propuesta: \"{}\"."
	},
	'French': {
	'multiple_categories_system_message': "Catégorisez strictement dans les catégories pertinentes parmi : {}. Évitez les justifications ; évitez les connexions tirées par les cheveux et privilégiez le lien direct ; excluez les catégories non pertinentes. Si aucune ne convient, répondez par '{{\"category\": \"Aucune correspondance\"}}'. Exemple de réponse : '{{\"category\": [\"écologie\", \"éducation\"]}}'.",
	'single_category_system_message': "Catégorisez strictement dans la catégorie la plus pertinente parmi : {}. Évitez les justifications ; évitez les connexions tirées par les cheveux et privilégiez le lien direct ; excluez les catégories non pertinentes. Si aucune ne convient, répondez par '{{\"category\": \"Aucune correspondance\"}}'. Exemple de réponse : '{{\"category\": \"éducation\"}}'.",
	'user_message': "Veuillez catégoriser cette proposition strictement dans une ou plusieurs des catégories pertinentes fournies. Proposition : \"{}\"."
	}
	}


	def get_api_url_from_process_url(process_url):
	    """Derive the ``<scheme>://<host>/api`` endpoint from any URL on the same site.

	    Args:
	        process_url: URL whose scheme and network location are reused.

	    Returns:
	        The API endpoint as a string, or ``None`` when the URL cannot be
	        parsed or lacks a scheme or a network location. The reason is
	        printed in that case.
	    """
	    try:
	        parts = urlparse(process_url)
	    except ValueError as err:
	        # urlparse itself rejects some malformed inputs.
	        print(err)
	        return None

	    if not (parts.scheme and parts.netloc):
	        print("URL is missing scheme or netloc.")
	        return None

	    return f"{parts.scheme}://{parts.netloc}/api"


	def make_request(client, model, system_message, user_message, temperature=0.3, top_p=0.6, max_tokens=50):
	    """Send one system + user exchange to the chat model and return the reply text.

	    Args:
	        client: Object exposing ``chat(model=..., messages=..., ...)``.
	        model: Model identifier forwarded to ``client.chat``.
	        system_message: Content of the system-role message.
	        user_message: Content of the user-role message.
	        temperature: Sampling temperature forwarded to the API.
	        top_p: Nucleus-sampling value forwarded to the API.
	        max_tokens: Completion length limit forwarded to the API.

	    Returns:
	        The content of the first choice, or ``""`` when the response
	        carries no choices.
	    """
	    print(f"Making request to model with temperature: {temperature}, top_p: {top_p}, max_tokens: {max_tokens}")
	    conversation = [
	        ChatMessage(role="system", content=system_message),
	        ChatMessage(role="user", content=user_message),
	    ]
	    response = client.chat(
	        model=model,
	        messages=conversation,
	        temperature=temperature,
	        top_p=top_p,
	        max_tokens=max_tokens,
	    )
	    has_reply = bool(response.choices)
	    reply = response.choices[0].message.content if has_reply else ""
	    print(f"Response received from model: {reply if has_reply else 'No response'}")
	    return reply


	def parse_category(response, categories):
	    """
	    Map a raw model reply onto the caller-supplied categories.

	    The reply is first parsed as JSON and its 'category' key is compared
	    case-insensitively with *categories*. If that yields nothing (invalid
	    JSON, no usable 'category' value, or no exact match), every category is
	    fuzzy-matched against the raw reply text instead.

	    Args:
	        response: Text returned by the model.
	        categories: Iterable of allowed category names.

	    Returns:
	        The matching category, spelled as in *categories* (several matches
	        are joined with ", "), or "Request human review" when the model
	        answered "None fit" or nothing reaches the fuzzy threshold of 80.
	    """
	    print(f"Parsing category from response: {response}")
	    if not isinstance(response, str):
	        # Nothing to parse or fuzzy-match against.
	        return "Request human review"

	    # Lower-cased lookup built once; values keep the caller's spelling and
	    # the first occurrence wins when two categories differ only by case.
	    canonical = {}
	    for name in categories:
	        canonical.setdefault(name.lower(), name)

	    try:
	        response_json = json.loads(response)
	    except json.JSONDecodeError:
	        print("Response is not in JSON format. Proceeding with fuzzy matching.")
	    else:
	        # Valid JSON is not necessarily an object (e.g. `5` or `"text"`).
	        category_response = response_json.get('category') if isinstance(response_json, dict) else None
	        if isinstance(category_response, list):
	            matches = []
	            for cat in category_response:
	                # Skip non-string entries instead of crashing on .lower().
	                if not isinstance(cat, str):
	                    continue
	                matched = canonical.get(cat.lower())
	                if matched is not None and matched not in matches:
	                    matches.append(matched)
	            if matches:
	                return ', '.join(matches)
	        elif isinstance(category_response, str):
	            key = category_response.lower()
	            if key in canonical:
	                return canonical[key]
	            if key == "none fit":
	                return "Request human review"

	    # Fallback to fuzzy matching if JSON parsing fails or no valid category is found
	    best_match = None
	    highest_score = 0
	    haystack = response.lower()
	    for category in categories:
	        score = fuzz.partial_ratio(category.lower(), haystack)
	        if score > highest_score:
	            best_match = category
	            highest_score = score
	            print(f"New best match found: {best_match} with score {score}")

	    if highest_score >= 80:
	        return best_match
	    return "Request human review"


	def categorize(proposal, categories, allow_multiple_categories, language='English', retries=3, is_single_proposal=True):
	    """
	    Ask the model to categorize one proposal, retrying with a higher temperature.

	    Each attempt sends the language-specific system and user prompts and hands
	    the reply to extract_json_category(). The first truthy result is returned.

	    Args:
	        proposal: Proposal text inserted into the user prompt.
	        categories: List of allowed category names.
	        allow_multiple_categories: Selects the multi-category system prompt
	            when truthy, the single-category one otherwise.
	        language: Key into the module-level ``messages`` table.
	        retries: Maximum number of model calls.
	        is_single_proposal: Not used by this function; kept so existing
	            callers that pass it keep working.

	    Returns:
	        The value produced by extract_json_category(), or
	        "Human intervention required" when every attempt failed.

	    Raises:
	        KeyError: If *language* has no entry in ``messages``.
	    """
	    print(f"Attempting to categorize proposal: {proposal}")
	    initial_temperature = 0.3  # Starting with a more conservative attempt
	    temperature_increment = 0.15  # Added on each retry to get a different sample
	    max_temperature = 1.0  # Keeps large `retries` values inside a safe range

	    categories_str = ", ".join(categories)
	    prompts = messages[language]
	    if allow_multiple_categories:
	        system_message = prompts['multiple_categories_system_message'].format(categories_str)
	    else:
	        system_message = prompts['single_category_system_message'].format(categories_str)
	    user_message = prompts['user_message'].format(proposal)

	    for attempt in range(retries):
	        temperature = min(round(initial_temperature + attempt * temperature_increment, 2), max_temperature)
	        print(f"Attempt {attempt + 1}/{retries}, using temperature: {temperature}")

	        # Pass the computed temperature: a hard-coded 0.3 here made every
	        # retry identical to the first attempt despite what the log claimed.
	        response = make_request(client, model, system_message, user_message, temperature=temperature, top_p=0.6, max_tokens=50)

	        category = extract_json_category(response, categories)
	        if category:
	            print(f"Categorization successful: {category}")
	            return category

	        if attempt + 1 < retries:
	            next_temperature = min(round(temperature + temperature_increment, 2), max_temperature)
	            print(f"Response did not meet criteria or was not in JSON. Retrying with temperature: {next_temperature}")
	        else:
	            print("Response did not meet criteria or was not in JSON. No retries left.")

	    print("Human intervention required")
	    return "Human intervention required"


	def _best_category_match(candidate, categories, threshold=80):
	    """Return the entry of *categories* most similar to *candidate*, or None below *threshold*."""
	    best_match = None
	    highest_score = 0
	    wanted = candidate.lower()
	    for category in categories:
	        score = fuzz.ratio(wanted, category.lower())
	        if score > highest_score:
	            best_match = category
	            highest_score = score
	    return best_match if highest_score >= threshold else None


	def extract_json_category(response, categories):
	    """Extract the 'category' value from a model reply and map it onto *categories*.

	    The outermost ``{...}`` span of the reply is parsed as JSON. A string
	    value is fuzzy-matched against the categories; a list value is matched
	    item by item.

	    Args:
	        response: Text returned by the model.
	        categories: Iterable of allowed category names.

	    Returns:
	        * the matched category (or several, joined with ", "), spelled as in
	          *categories*;
	        * "Intervention humaine requise" when a 'category' key is present but
	          is "none fit" or matches nothing;
	        * ``None`` when the reply holds no parseable JSON object with a
	          'category' key. The caller treats a falsy result as a reason to retry.
	    """
	    # NOTE(review): the sentinel below is French while categorize() reports
	    # "Human intervention required"; kept as-is because it is visible output.
	    if not isinstance(response, str):
	        return None

	    # Use a regular expression to extract the JSON-like content
	    json_match = re.search(r'{.*}', response, re.DOTALL)
	    if not json_match:
	        return None

	    try:
	        response_json = json.loads(json_match.group())
	    except json.JSONDecodeError:
	        print("Response is not valid JSON.")
	        return None

	    if 'category' not in response_json:
	        return None

	    category_val = response_json['category']
	    if isinstance(category_val, str):
	        if fuzz.ratio(category_val.lower(), "none fit") >= 80:
	            return "Intervention humaine requise"
	        best_match = _best_category_match(category_val, categories)
	        if best_match is not None:
	            return best_match
	    elif isinstance(category_val, list):
	        valid_categories = []
	        for cat in category_val:
	            # Non-string entries (numbers, null, nested objects) cannot match.
	            if not isinstance(cat, str):
	                continue
	            best_match = _best_category_match(cat, categories)
	            # Skip repeats so near-duplicate answers give one entry.
	            if best_match is not None and best_match not in valid_categories:
	                valid_categories.append(best_match)
	        if valid_categories:
	            return ', '.join(valid_categories)

	    return "Intervention humaine requise"

	def fetch_and_save_proposals(input_url, file_name="proposals.xlsx", locale="en"):
	    """Download the proposals of a participatory process and save them to Excel.

	    The process slug is the path segment that follows ``/processes/`` in
	    *input_url*. The site's ``/api`` endpoint is queried with GraphQL and
	    each proposal is stored as ``"<title>:<body>"`` in the first column of
	    *file_name*, without a header row.

	    Args:
	        input_url: URL of a page inside the participatory process.
	        file_name: Path of the Excel file to write.
	        locale: Locale requested for the title and body translations.

	    Returns:
	        *file_name* on success, or ``None`` when *input_url* is not a usable
	        URL or contains no process slug.

	    Raises:
	        Exception: If the API does not answer with status 200 or the answer
	            lacks the expected process/components data.
	        requests.exceptions.RequestException: On network failure or timeout.
	    """
	    # Construct the API URL
	    api_url = get_api_url_from_process_url(input_url)
	    if not api_url:
	        print("Invalid URL provided.")
	        return None

	    # Read the slug from the URL *path* so that a trailing slash or a query
	    # string does not end up inside it.
	    _, marker, tail = urlparse(input_url).path.partition("/processes/")
	    participatory_process_slug = tail.split("/", 1)[0] if marker else ""
	    if not participatory_process_slug:
	        print("Invalid URL provided.")
	        return None

	    # json.dumps yields a quoted, escaped string literal, so quotes or
	    # backslashes in user input cannot alter the GraphQL query.
	    slug_literal = json.dumps(participatory_process_slug, ensure_ascii=False)
	    locale_literal = json.dumps(locale, ensure_ascii=False)

	    query = f"""
	    {{
	      participatoryProcess(slug: {slug_literal}) {{
	        components {{
	          id
	          ... on Proposals {{
	            proposals {{
	              edges {{
	                node {{
	                  id
	                  title {{
	                    translation(locale: {locale_literal})
	                  }}
	                  body {{
	                    translation(locale: {locale_literal})
	                  }}
	                  createdAt
	                }}
	              }}
	            }}
	          }}
	        }}
	      }}
	    }}
	    """
	    print(f"Making request to API URL: {api_url}")
	    # Without a timeout an unresponsive server would block the request forever.
	    response = requests.post(api_url, json={'query': query}, timeout=30)

	    if response.status_code != 200:
	        raise Exception(f"Query failed to run by returning code of {response.status_code}.")

	    data = response.json()
	    process = (data.get('data') or {}).get('participatoryProcess') or {}
	    components = process.get('components')
	    if not components:
	        # Handle case where the expected data is missing
	        raise Exception("The expected data was not found in the response.")

	    proposals = []
	    for component in components:
	        # Only components that matched the Proposals fragment carry this key.
	        edges = (component.get('proposals') or {}).get('edges') or []
	        for edge in edges:
	            node = edge['node']
	            proposals.append(f"{node['title']['translation']}:{node['body']['translation']}")

	    df = pd.DataFrame(proposals, columns=["Proposal"])
	    # No header row: the file is read back with header=None, which would
	    # otherwise treat the "Proposal" heading as a proposal to categorize.
	    df.to_excel(file_name, index=False, header=False)
	    return file_name



	def pipeline(excel_file, categories, allow_multiple_categories, language='English'):
	    """
	    Categorize every proposal found in the first column of an Excel file.

	    Args:
	        excel_file: Path or file-like object accepted by ``pd.read_excel``;
	            read without a header row.
	        categories: List of allowed category names.
	        allow_multiple_categories: Forwarded to categorize().
	        language: Prompt language forwarded to categorize(). Defaults to
	            'English', the language used before this parameter existed.

	    Returns:
	        DataFrame with the columns 'Proposal' and 'Categorized As', one row
	        per proposal (empty when the sheet holds no data).
	    """
	    df = pd.read_excel(excel_file, header=None)
	    results = []
	    # An empty sheet yields a frame without columns, so df[0] would raise.
	    if not df.empty:
	        for proposal in df[0]:
	            print(f"Processing proposal: {proposal}")
	            category = categorize(proposal, categories, allow_multiple_categories, language)
	            results.append((proposal, category))

	    results_df = pd.DataFrame(results, columns=['Proposal', 'Categorized As'])
	    print("Processing complete.")
	    return results_df



	def process_inputs(file_upload, url_input, categories_str, allow_mult, language):
	    """Categorize proposals from an uploaded Excel file or a process URL.

	    An uploaded file takes priority over the URL when both are given.

	    Args:
	        file_upload: Uploaded Excel file (anything ``pd.read_excel`` accepts),
	            or a falsy value when nothing was uploaded.
	        url_input: URL passed to fetch_and_save_proposals(), or empty.
	        categories_str: Comma-separated category names.
	        allow_mult: Whether a proposal may receive several categories.
	        language: 'English', 'French' or 'Spanish'; selects the prompts and
	            the locale requested from the API.

	    Returns:
	        A 2-tuple ``(results_html, download_link_html)``. On invalid input the
	        first element is an explanatory message and the second is ``""``.
	    """
	    categories = [cat.strip() for cat in (categories_str or "").split(',') if cat.strip()]
	    if not categories:
	        # Without categories every proposal would only burn API calls.
	        return "Please enter at least one category.", ""

	    # Stays None when neither input was provided.
	    df = None

	    # Decide between file upload and URL input, prioritizing file upload if both are provided.
	    if file_upload:
	        df = pd.read_excel(file_upload, header=None)
	    elif url_input:
	        locale = {"French": "fr", "English": "en"}.get(language, "es")
	        try:
	            file_name = fetch_and_save_proposals(url_input, locale=locale)
	        except Exception:
	            # UI boundary: log the traceback and show a message instead of
	            # letting the request fail with an unhandled error.
	            logging.exception("Fetching proposals from %s failed", url_input)
	            file_name = None
	        if not file_name:
	            # Every return carries exactly two values, matching the two
	            # output components wired to this function.
	            return "URL issue. Please try again.", ""
	        df = pd.read_excel(file_name, header=None)

	    if df is None:
	        return "Please enter either a URL or upload an Excel file.", ""

	    results = []
	    if df.shape[1]:
	        for proposal in df.iloc[:, 0]:
	            if pd.isna(proposal):
	                # Blank cells carry nothing to categorize.
	                continue
	            category = categorize(proposal, categories, allow_mult, language)
	            results.append((proposal, category))

	    results_df = pd.DataFrame(results, columns=['Proposal', 'Category'])
	    # Proposal text comes from uploads or a remote site, so it is escaped
	    # rather than injected into the page as raw HTML.
	    output_html = results_df.to_html(escape=True, index=False)

	    # Build the workbook in memory and embed it in a data-URI download link.
	    output = io.BytesIO()
	    results_df.to_excel(output, index=False)
	    excel_base64 = base64.b64encode(output.getvalue()).decode('utf-8')
	    download_link = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{excel_base64}" download="categorized_results.xlsx">Download Excel file</a>'

	    return output_html, download_link

	# Gradio interface: two input options (Excel upload or process URL), the
	# categorization settings, and two HTML outputs filled by process_inputs().
	with gr.Blocks() as demo:
	    gr.Markdown("# 📊 Automatic proposal categorization")
	    gr.Markdown("This tool allows you to categorize proposals. It's primary intent is to do that for <a href='https://decidim.org' target='_blank'>Decidim</a> (open source participatory democracy framework) proposals, but you can use it for any proposals you want.")
	    gr.Markdown("<i>The tool was built by the team at <a href='https://opensourcepolitics.eu' target='_blank'>Open Source Politics</a>.</i>")

	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("### 1st option : Upload an Excel file with one column of proposals")
	            # Extensions must start with a dot; a bare "xlsx" is read as a
	            # file *type* name and does not restrict uploads to Excel files.
	            file_upload = gr.File(label="📁 Upload the Excel file", file_types=[".xlsx"])
	        with gr.Column():
	            gr.Markdown("### 2nd option : Classify Decidim proposals")
	            url_input = gr.Textbox(label="🌐 Proposal component URL", placeholder="Enter the URL here")

	    language_select = gr.Dropdown(choices=['English', 'Spanish', 'French'], label="Select Language", value='English')
	    categories_input = gr.Textbox(label="🏷️ Enter categories (separated by commas)", placeholder="for example: Education, Health, Environnement, Other")
	    allow_multiple_categories = gr.Checkbox(label="✅ Allow more than one category per proposal", value=True)

	    submit_button = gr.Button("🚀 Find the categories", variant="primary")
	    output_html = gr.HTML(label="📋 Results")
	    output_download = gr.HTML(label="📥 Download results in XLSX")

	    # The inputs are listed in process_inputs' parameter order, so the
	    # function is passed directly instead of through a forwarding lambda.
	    submit_button.click(
	        fn=process_inputs,
	        inputs=[file_upload, url_input, categories_input, allow_multiple_categories, language_select],
	        outputs=[output_html, output_download],
	    )

	    gr.Markdown("### Instructions:")
	    gr.Markdown("1. Excel Upload: The Excel file should contain the proposals listed in the first column without any header.")
	    gr.Markdown("2. Proposals Component URL: Provide the full URL of a proposals component page where the proposals are listed.")
	    gr.Markdown("3. Categories: Enter the categories separated by commas. For example, Education, Health, Environment.")
	    gr.Markdown("4. Multiple Categories: Check this box if you want to allow proposals to be categorized into more than one category.")

	demo.launch()