# Page-header residue kept for provenance: author "monsimas";
# revision note "change temp, top p and max tokens" (9d0a6de).
import pandas as pd
import gradio as gr
import io
import base64
import os
import time
import logging
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from fuzzywuzzy import fuzz
import re
import json
import requests
import pandas as pd
from urllib.parse import urlparse
# Initialize logging
# Root logger is configured at DEBUG level with a "timestamp - level - message" format.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Set up Mistral AI Client
# The API key is read from the MISTRAL_API_KEY environment variable; if the
# variable is not set, the os.environ lookup raises KeyError at import time.
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
# Model identifier that categorize() forwards to make_request() for each chat call.
model = "open-mixtral-8x7b"
# Prompt templates, keyed first by UI language and then by role.
#
# Each system template takes one positional ``str.format`` argument (the
# comma-separated category list); ``user_message`` takes the proposal text.
# Doubled braces (``{{`` / ``}}``) are format escapes that render as the literal
# braces of the JSON examples shown to the model.
messages = {
    'English': {
        'multiple_categories_system_message': "Categorize strictly in the relevant categories among: {}. Avoid justifications; avoid far-fetched connections and favor direct links; exclude irrelevant categories. If none fit, reply with '{{\"category\": \"None fit\"}}'. Example response: '{{\"category\": [\"ecology\", \"education\"]}}'.",
        'single_category_system_message': "Categorize strictly in the most relevant category among: {}. Avoid justifications; avoid far-fetched connections and favor direct links; exclude irrelevant categories. If none fit, reply with '{{\"category\": \"None fit\"}}'. Example response: '{{\"category\": \"education\"}}'.",
        'user_message': "Please categorize this proposal strictly in one or more of the relevant categories provided. Proposal: \"{}\". ",
    },
    'Spanish': {
        'multiple_categories_system_message': "Categorizar estrictamente en las categorías relevantes entre: {}. Evite justificaciones; evite conexiones forzadas y favorezca vínculos directos; excluya categorías irrelevantes. Si ninguna es adecuada, responda con '{{\"category\": \"Ninguna corresponde\"}}'. Ejemplo de respuesta: '{{\"category\": [\"ecología\", \"educación\"]}}'.",
        'single_category_system_message': "Categorizar estrictamente en la categoría más relevante entre: {}. Evite justificaciones; evite conexiones forzadas y favorezca vínculos directos; excluya categorías irrelevantes. Si ninguna es adecuada, responda con '{{\"category\": \"Ninguna corresponde\"}}'. Ejemplo de respuesta: '{{\"category\": \"educación\"}}'.",
        'user_message': "Por favor, categorice esta propuesta estrictamente en una o más de las categorías relevantes proporcionadas. Propuesta: \"{}\".",
    },
    'French': {
        'multiple_categories_system_message': "Catégorisez strictement dans les catégories pertinentes parmi : {}. Évitez les justifications ; évitez les connexions tirées par les cheveux et privilégiez le lien direct ; excluez les catégories non pertinentes. Si aucune ne convient, répondez par '{{\"category\": \"Aucune correspondance\"}}'. Exemple de réponse : '{{\"category\": [\"écologie\", \"éducation\"]}}'.",
        'single_category_system_message': "Catégorisez strictement dans la catégorie la plus pertinente parmi : {}. Évitez les justifications ; évitez les connexions tirées par les cheveux et privilégiez le lien direct ; excluez les catégories non pertinentes. Si aucune ne convient, répondez par '{{\"category\": \"Aucune correspondance\"}}'. Exemple de réponse : '{{\"category\": \"éducation\"}}'.",
        'user_message': "Veuillez catégoriser cette proposition strictement dans une ou plusieurs des catégories pertinentes fournies. Proposition : \"{}\".",
    },
}
def get_api_url_from_process_url(process_url):
    """Derive the ``<scheme>://<host>/api`` endpoint from a process page URL.

    Returns the API URL as a string, or ``None`` (after printing the reason)
    when the URL has no scheme or network location, or when ``urlparse``
    itself rejects it with ``ValueError``.
    """
    try:
        parts = urlparse(process_url)
        if not (parts.scheme and parts.netloc):
            raise ValueError("URL is missing scheme or netloc.")
    except ValueError as error:
        print(error)
        return None
    return f"{parts.scheme}://{parts.netloc}/api"
def make_request(client, model, system_message, user_message, temperature=0.3, top_p=0.6, max_tokens=50):
    """Send one system+user exchange to the chat model and return the reply text.

    The sampling parameters are forwarded unchanged to ``client.chat``.
    Returns the content of the first choice, or an empty string when the
    response carries no choices. Both the request parameters and the reply
    are echoed to stdout.
    """
    print(f"Making request to model with temperature: {temperature}, top_p: {top_p}, max_tokens: {max_tokens}")
    conversation = [
        ChatMessage(role=role, content=content)
        for role, content in (("system", system_message), ("user", user_message))
    ]
    response = client.chat(
        model=model,
        messages=conversation,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if not response.choices:
        print("Response received from model: No response")
        return ""
    reply = response.choices[0].message.content
    print(f"Response received from model: {reply}")
    return reply
def parse_category(response, categories):
    """Map a raw model reply onto one of the allowed categories.

    Strategy:
      1. Parse ``response`` as JSON and read its ``category`` entry.
         * list   -> keep the string items that equal a provided category
                     (case-insensitively) and return them joined by ", ".
         * string -> return it if it equals a provided category
                     (case-insensitively); "none fit" yields
                     "Request human review".
      2. Otherwise (not JSON, no usable ``category``, or nothing matched) fall
         back to fuzzy partial matching of each category against the whole
         reply and accept the best score when it reaches 80.

    Args:
        response: Text returned by the model.
        categories: Iterable of allowed category names.

    Returns:
        The matched category text (as spelled by the model for JSON matches,
        as spelled in ``categories`` for fuzzy matches), or
        "Request human review" when nothing fits.
    """
    print(f"Parsing category from response: {response}")
    # Guard against a missing reply so the JSON and fuzzy steps get a string.
    response = response or ""
    # Lower-cased lookup set, built once instead of once per candidate.
    known = {c.lower() for c in categories}
    try:
        response_json = json.loads(response)
    except json.JSONDecodeError:
        print("Response is not in JSON format. Proceeding with fuzzy matching.")
    else:
        # Valid JSON is not necessarily an object ("42", "[...]"); only an
        # object can carry a 'category' key.
        category_response = response_json.get('category') if isinstance(response_json, dict) else None
        if isinstance(category_response, list):
            # Ignore non-string items rather than crashing on .lower().
            matches = [cat for cat in category_response if isinstance(cat, str) and cat.lower() in known]
            if matches:
                return ', '.join(matches)
        elif isinstance(category_response, str):
            lowered = category_response.lower()
            if lowered in known:
                return category_response
            if lowered == "none fit":
                return "Request human review"
        # Any other value type (null, number, object) falls through to fuzzy matching.

    # Fallback to fuzzy matching if JSON parsing fails or no valid category is found
    best_match = None
    highest_score = 0
    haystack = response.lower()
    for category in categories:
        score = fuzz.partial_ratio(category.lower(), haystack)
        if score > highest_score:
            best_match = category
            highest_score = score
            print(f"New best match found: {best_match} with score {score}")
    if highest_score >= 80:
        return best_match
    return "Request human review"
def categorize(proposal, categories, allow_multiple_categories, language='English', retries=3, is_single_proposal=True):
    """Categorize one proposal, retrying with a higher temperature on failure.

    Builds the system and user prompts for ``language`` from the module-level
    ``messages`` templates, queries the model, and asks
    ``extract_json_category`` to validate the reply. A falsy extraction result
    triggers another attempt, up to ``retries`` in total.

    Args:
        proposal: Proposal text to classify.
        categories: Iterable of allowed category names.
        allow_multiple_categories: Selects the multi-category system prompt
            when truthy, the single-category prompt otherwise.
        language: Key into ``messages`` ('English', 'Spanish' or 'French').
        retries: Maximum number of model calls.
        is_single_proposal: Accepted for compatibility; not used by this
            function.

    Returns:
        The category string produced by ``extract_json_category``, or
        "Human intervention required" when every attempt fails.

    Raises:
        KeyError: If ``language`` has no entry in ``messages``.
    """
    print(f"Attempting to categorize proposal: {proposal}")
    initial_temperature = 0.3  # Starting with a more conservative attempt
    temperature_increment = 0.15  # Increase temperature on each retry to encourage clearer responses
    max_temperature = 1.0  # Conservative ceiling so a large `retries` cannot escalate without bound

    templates = messages[language]
    system_key = (
        'multiple_categories_system_message'
        if allow_multiple_categories
        else 'single_category_system_message'
    )
    system_message = templates[system_key].format(", ".join(categories))
    user_message = templates['user_message'].format(proposal)

    for attempt in range(retries):
        # Rounded to avoid float noise such as 0.44999999999999996 in logs and requests.
        temperature = round(min(initial_temperature + attempt * temperature_increment, max_temperature), 2)
        print(f"Attempt {attempt + 1}/{retries}, using temperature: {temperature}")
        # Pass the per-attempt temperature: previously a constant 0.3 was sent,
        # so every retry repeated the same sampling settings.
        response = make_request(client, model, system_message, user_message, temperature=temperature, top_p=0.6, max_tokens=50)
        category = extract_json_category(response, categories)
        if category:
            print(f"Categorization successful: {category}")
            return category
        if attempt + 1 < retries:
            next_temperature = round(min(temperature + temperature_increment, max_temperature), 2)
            print(f"Response did not meet criteria or was not in JSON. Retrying with temperature: {next_temperature}")
        else:
            print("Response did not meet criteria or was not in JSON.")
    print("Human intervention required")
    return "Human intervention required"
def extract_json_category(response, categories):
    """Extract and validate the ``category`` value from a model reply.

    The first ``{`` through the last ``}`` of ``response`` is parsed as JSON.
    String values are fuzzy-matched (``fuzz.ratio`` >= 80) against
    ``categories``; list values are matched item by item.

    Args:
        response: Text returned by the model (may contain text around the JSON).
        categories: Iterable of allowed category names.

    Returns:
        * the matching name from ``categories`` (or several, joined by ", ",
          without duplicates) when the reply names valid categories;
        * "Intervention humaine requise" when a ``category`` key is present
          but holds a "none fit" sentinel (in any of the three prompt
          languages) or nothing that matches a provided category;
        * ``None`` when no JSON object is found, it cannot be decoded, or it
          has no ``category`` key, so the caller can retry.
    """
    human_review = "Intervention humaine requise"
    # Sentinels requested by the English, Spanish and French prompts.
    none_fit_markers = ("none fit", "ninguna corresponde", "aucune correspondance")

    json_match = re.search(r'{.*}', response or "", re.DOTALL)
    if not json_match:
        return None
    try:
        response_json = json.loads(json_match.group())
    except json.JSONDecodeError:
        print("Response is not valid JSON.")
        return None
    if 'category' not in response_json:
        return None

    category_val = response_json['category']
    if isinstance(category_val, str):
        lowered = category_val.lower()
        if any(fuzz.ratio(lowered, marker) >= 80 for marker in none_fit_markers):
            return human_review
        best_match = _closest_category(category_val, categories)
        if best_match is not None:
            return best_match
    elif isinstance(category_val, list):
        valid_categories = []
        for candidate in category_val:
            best_match = _closest_category(candidate, categories)
            # Two model spellings can resolve to the same category; list it once.
            if best_match is not None and best_match not in valid_categories:
                valid_categories.append(best_match)
        if valid_categories:
            return ', '.join(valid_categories)
    return human_review


def _closest_category(candidate, categories, threshold=80):
    """Return the entry of ``categories`` most similar to ``candidate``, or None below ``threshold``."""
    if not isinstance(candidate, str):
        # Non-string items (numbers, null, objects) can never name a category.
        return None
    best_match = None
    highest_score = 0
    lowered = candidate.lower()
    for category in categories:
        score = fuzz.ratio(lowered, category.lower())
        if score > highest_score:
            best_match = category
            highest_score = score
    return best_match if highest_score >= threshold else None
def fetch_and_save_proposals(input_url, file_name="proposals.xlsx", locale="en"):
    """Download the proposals of a participatory process and save them to Excel.

    The process slug is taken from the URL path (the part after
    ``/processes/`` and before ``/f/``), the GraphQL endpoint is derived with
    ``get_api_url_from_process_url``, and every proposal found in the
    process components is written as one ``"<title>:<body>"`` cell in the
    first column of ``file_name``.

    The sheet is written WITHOUT a header row because the readers in this
    file load it with ``header=None``; a header would be categorized as if it
    were a proposal.

    Args:
        input_url: URL of a process (or proposals component) page.
        file_name: Destination ``.xlsx`` path.
        locale: Locale code requested for the title/body translations.

    Returns:
        ``file_name`` on success, or ``None`` when ``input_url`` is not a
        usable URL.

    Raises:
        Exception: If the API answers with a non-200 status or the response
            lacks the expected ``participatoryProcess.components`` data.
        requests.RequestException: On connection problems or timeout.
    """
    # Construct the API URL (also validates that the URL has a scheme and host)
    api_url = get_api_url_from_process_url(input_url)
    if not api_url:
        print("Invalid URL provided.")
        return None

    # Extract the slug from the path only, so a query string, fragment or
    # trailing slash never ends up inside it.
    url_path = urlparse(input_url).path
    participatory_process_slug = url_path.split("/processes/")[-1].split("/f/")[0].strip("/")

    # json.dumps yields a quoted, escaped string literal, so quotes or
    # backslashes in user input cannot break out of the GraphQL string.
    slug_literal = json.dumps(participatory_process_slug)
    locale_literal = json.dumps(locale)
    query = f"""
    {{
      participatoryProcess(slug: {slug_literal}) {{
        components {{
          id
          ... on Proposals {{
            proposals {{
              edges {{
                node {{
                  id
                  title {{
                    translation(locale: {locale_literal})
                  }}
                  body {{
                    translation(locale: {locale_literal})
                  }}
                  createdAt
                }}
              }}
            }}
          }}
        }}
      }}
    }}
    """
    print(f"Making request to API URL: {api_url}")
    # A timeout keeps the UI from hanging forever on an unresponsive server.
    response = requests.post(api_url, json={'query': query}, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Query failed to run by returning code of {response.status_code}.")

    data = response.json()
    process = (data.get('data') or {}).get('participatoryProcess') or {}
    components = process.get('components')
    if not components:
        # Handle case where the expected data is missing
        raise Exception("The expected data was not found in the response.")

    proposals = []
    for component in components:
        # Components that are not proposal components carry no 'proposals' entry.
        edges = ((component or {}).get('proposals') or {}).get('edges') or []
        for edge in edges:
            node = edge['node']
            # A translation can be null when the locale is missing.
            title = (node.get('title') or {}).get('translation') or ""
            body = (node.get('body') or {}).get('translation') or ""
            if not (title or body):
                continue  # nothing to categorize for this locale
            proposals.append(f"{title}:{body}")

    df = pd.DataFrame(proposals, columns=["Proposal"])
    df.to_excel(file_name, index=False, header=False)
    return file_name
def pipeline(excel_file, categories, allow_multiple_categories, language='English'):
    """Categorize every proposal found in the first column of an Excel file.

    Args:
        excel_file: Path or file-like object readable by ``pandas.read_excel``;
            the sheet is read without a header row.
        categories: Iterable of allowed category names.
        allow_multiple_categories: Forwarded to ``categorize``.
        language: Prompt language forwarded to ``categorize``; defaults to
            'English', which matches the previous fixed behaviour.

    Returns:
        A DataFrame with the columns 'Proposal' and 'Categorized As', one row
        per proposal (empty when the sheet has no data).
    """
    df = pd.read_excel(excel_file, header=None)
    results = []
    # An empty sheet produces a frame without column 0; skip instead of raising KeyError.
    if not df.empty:
        for proposal in df[0]:
            print(f"Processing proposal: {proposal}")
            category = categorize(proposal, categories, allow_multiple_categories, language)
            results.append((proposal, category))
    results_df = pd.DataFrame(results, columns=['Proposal', 'Categorized As'])
    print("Processing complete.")
    return results_df
def process_inputs(file_upload, url_input, categories_str, allow_mult, language):
    """Gradio callback: load proposals, categorize them, and build the outputs.

    Proposals come from the uploaded Excel file when one is given, otherwise
    from the process URL. Every non-blank cell of the first column is passed
    to ``categorize``.

    Args:
        file_upload: Uploaded Excel file (anything ``pandas.read_excel``
            accepts), or a falsy value when nothing was uploaded.
        url_input: URL of the proposals page, used only without an upload.
        categories_str: Comma-separated category names.
        allow_mult: Whether several categories per proposal are allowed.
        language: 'English', 'Spanish' or 'French'; selects the prompts and
            the locale requested from the API.

    Returns:
        A 2-tuple ``(results_html, download_link_html)`` matching the two
        Gradio outputs. On invalid input the first element is an explanatory
        message and the second is an empty string.
    """
    # Drop empty entries such as the one produced by a trailing comma.
    categories = [cat.strip() for cat in (categories_str or "").split(',') if cat.strip()]
    if not categories:
        return "Please enter at least one category.", ""

    # Decide between file upload and URL input, prioritizing file upload if both are provided.
    df = None
    if file_upload:
        df = pd.read_excel(file_upload, header=None)
    elif url_input:
        locale = {"French": "fr", "English": "en"}.get(language, "es")
        try:
            file_name = fetch_and_save_proposals(url_input, locale=locale)
        except Exception:
            # UI boundary: report the failure to the user instead of surfacing
            # a traceback, but keep the details in the log.
            logging.exception("Fetching proposals from %s failed", url_input)
            return "URL issue. Please try again.", ""
        if not file_name:
            # Fetching proposals failed, likely due to URL issues.
            return "URL issue. Please try again.", ""
        df = pd.read_excel(file_name, header=None)

    if df is None:
        # Neither a file nor a URL was provided.
        return "Please enter either a URL or upload an Excel file.", ""

    results = []
    if not df.empty:
        for proposal in df[0]:
            if pd.isna(proposal):
                continue  # blank cell: nothing to categorize
            category = categorize(proposal, categories, allow_mult, language)
            results.append((proposal, category))
    results_df = pd.DataFrame(results, columns=['Proposal', 'Category'])

    # Proposal text is untrusted (uploaded or fetched from a remote site), so
    # let pandas HTML-escape it instead of injecting it raw into the page.
    output_html = results_df.to_html(escape=True, index=False)

    # Build the Excel file in memory and embed it as a base64 data URI.
    buffer = io.BytesIO()
    results_df.to_excel(buffer, index=False)
    excel_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    download_link = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{excel_base64}" download="categorized_results.xlsx">Download Excel file</a>'
    return output_html, download_link
# Gradio interface: two input options side by side (Excel upload or process
# URL), shared settings below them, then the results and usage instructions.
with gr.Blocks() as demo:
    gr.Markdown("# 📊 Automatic proposal categorization")
    gr.Markdown("This tool allows you to categorize proposals. It's primary intent is to do that for <a href='https://decidim.org' target='_blank'>Decidim</a> (open source participatory democracy framework) proposals, but you can use it for any proposals you want.")
    gr.Markdown("<i>The tool was built by the team at <a href='https://opensourcepolitics.eu' target='_blank'>Open Source Politics</a>.</i>")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 1st option : Upload an Excel file with one column of proposals")
            # Extension filters need the leading dot; a bare "xlsx" is not a
            # recognised file type and does not restrict the picker to Excel files.
            file_upload = gr.File(label="📁 Upload the Excel file", file_types=[".xlsx"])
        with gr.Column():
            gr.Markdown("### 2nd option : Classify Decidim proposals")
            url_input = gr.Textbox(label="🌐 Proposal component URL", placeholder="Enter the URL here")

    # Settings shared by both input options.
    language_select = gr.Dropdown(choices=['English', 'Spanish', 'French'], label="Select Language", value='English')
    categories_input = gr.Textbox(label="🏷️ Enter categories (separated by commas)", placeholder="for example: Education, Health, Environnement, Other")
    allow_multiple_categories = gr.Checkbox(label="✅ Allow more than one category per proposal", value=True)
    submit_button = gr.Button("🚀 Find the categories", variant="primary")

    output_html = gr.HTML(label="📋 Results")
    output_download = gr.HTML(label="📥 Download results in XLSX")

    # The callback must return exactly two values, one per output component.
    submit_button.click(
        fn=lambda file_upload, url_input, categories_input, allow_multiple_categories, language_select: process_inputs(
            file_upload, url_input, categories_input, allow_multiple_categories, language_select
        ),
        inputs=[file_upload, url_input, categories_input, allow_multiple_categories, language_select],
        outputs=[output_html, output_download]
    )

    gr.Markdown("### Instructions:")
    gr.Markdown("1. Excel Upload: The Excel file should contain the proposals listed in the first column without any header.")
    gr.Markdown("2. Proposals Component URL: Provide the full URL of a proposals component page where the proposals are listed.")
    gr.Markdown("3. Categories: Enter the categories separated by commas. For example, Education, Health, Environment.")
    gr.Markdown("4. Multiple Categories: Check this box if you want to allow proposals to be categorized into more than one category.")

demo.launch()