# Captured from a Hugging Face Space (status when captured: Sleeping).
import pandas as pd | |
import gradio as gr | |
import io | |
import base64 | |
import os | |
import time | |
import logging | |
from mistralai.client import MistralClient | |
from mistralai.models.chat_completion import ChatMessage | |
from fuzzywuzzy import fuzz | |
import re | |
import json | |
import requests | |
import pandas as pd | |
from urllib.parse import urlparse | |
# Configure the root logger once at import time: DEBUG level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Shared model name and Mistral client used by every categorization request.
# The key is read with a plain os.environ lookup, so a missing MISTRAL_API_KEY
# raises KeyError as soon as the module is imported.
model = "open-mixtral-8x7b"
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
# Prompt templates, keyed first by UI language and then by role.
#
# * The two ``*_system_message`` templates take one ``str.format`` argument: the
#   comma-separated category list. Literal JSON braces are doubled (``{{`` / ``}}``)
#   so that ``format`` leaves them in place.
# * ``user_message`` takes one ``str.format`` argument: the proposal text.
messages = {
    'English': {
        'multiple_categories_system_message': "Categorize strictly in the relevant categories among: {}. Avoid justifications; avoid far-fetched connections and favor direct links; exclude irrelevant categories. If none fit, reply with '{{\"category\": \"None fit\"}}'. Example response: '{{\"category\": [\"ecology\", \"education\"]}}'.",
        'single_category_system_message': "Categorize strictly in the most relevant category among: {}. Avoid justifications; avoid far-fetched connections and favor direct links; exclude irrelevant categories. If none fit, reply with '{{\"category\": \"None fit\"}}'. Example response: '{{\"category\": \"education\"}}'.",
        'user_message': "Please categorize this proposal strictly in one or more of the relevant categories provided. Proposal: \"{}\". ",
    },
    'Spanish': {
        'multiple_categories_system_message': "Categorizar estrictamente en las categorías relevantes entre: {}. Evite justificaciones; evite conexiones forzadas y favorezca vínculos directos; excluya categorías irrelevantes. Si ninguna es adecuada, responda con '{{\"category\": \"Ninguna corresponde\"}}'. Ejemplo de respuesta: '{{\"category\": [\"ecología\", \"educación\"]}}'.",
        'single_category_system_message': "Categorizar estrictamente en la categoría más relevante entre: {}. Evite justificaciones; evite conexiones forzadas y favorezca vínculos directos; excluya categorías irrelevantes. Si ninguna es adecuada, responda con '{{\"category\": \"Ninguna corresponde\"}}'. Ejemplo de respuesta: '{{\"category\": \"educación\"}}'.",
        'user_message': "Por favor, categorice esta propuesta estrictamente en una o más de las categorías relevantes proporcionadas. Propuesta: \"{}\".",
    },
    'French': {
        'multiple_categories_system_message': "Catégorisez strictement dans les catégories pertinentes parmi : {}. Évitez les justifications ; évitez les connexions tirées par les cheveux et privilégiez le lien direct ; excluez les catégories non pertinentes. Si aucune ne convient, répondez par '{{\"category\": \"Aucune correspondance\"}}'. Exemple de réponse : '{{\"category\": [\"écologie\", \"éducation\"]}}'.",
        'single_category_system_message': "Catégorisez strictement dans la catégorie la plus pertinente parmi : {}. Évitez les justifications ; évitez les connexions tirées par les cheveux et privilégiez le lien direct ; excluez les catégories non pertinentes. Si aucune ne convient, répondez par '{{\"category\": \"Aucune correspondance\"}}'. Exemple de réponse : '{{\"category\": \"éducation\"}}'.",
        'user_message': "Veuillez catégoriser cette proposition strictement dans une ou plusieurs des catégories pertinentes fournies. Proposition : \"{}\".",
    },
}
def get_api_url_from_process_url(process_url):
    """Derive the API endpoint ``<scheme>://<netloc>/api`` from a process URL.

    Returns the endpoint as a string. When the URL has no scheme or no network
    location (or ``urlparse`` itself raises ``ValueError``), the reason is
    printed and ``None`` is returned instead.
    """
    try:
        parts = urlparse(process_url)
        if not (parts.scheme and parts.netloc):
            raise ValueError("URL is missing scheme or netloc.")
    except ValueError as error:
        print(error)
        return None
    return f"{parts.scheme}://{parts.netloc}/api"
def make_request(client, model, system_message, user_message, temperature=0.3, top_p=0.6, max_tokens=50):
    """Send one system + user exchange to the chat model and return the reply text.

    The sampling settings are printed before the call and the reply after it.
    Returns the content of the first choice, or an empty string when the
    response contains no choices.
    """
    print(f"Making request to model with temperature: {temperature}, top_p: {top_p}, max_tokens: {max_tokens}")
    conversation = [
        ChatMessage(role="system", content=system_message),
        ChatMessage(role="user", content=user_message),
    ]
    response = client.chat(
        model=model,
        messages=conversation,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    has_reply = bool(response.choices)
    content = response.choices[0].message.content if has_reply else ""
    print(f"Response received from model: {content if has_reply else 'No response'}")
    return content
def parse_category(response, categories):
    """Map a raw model reply onto the configured categories.

    The reply is first parsed as JSON. If it is an object with a ``category``
    key, that value is matched case-insensitively against ``categories``:

    * a list yields the matching entries joined with ``", "`` (in the reply's
      own spelling); non-string entries are ignored;
    * a string that matches a category is returned as given;
    * the string ``"none fit"`` (any casing) yields ``"Request human review"``.

    In every other case (invalid JSON, JSON that is not an object, a missing or
    unusable ``category`` value, no match) the whole reply is fuzzy-matched
    against each category with ``fuzz.partial_ratio``. The best category is
    returned if it scores at least 80, otherwise ``"Request human review"``.

    Args:
        response: Reply text from the model.
        categories: Iterable of category names (strings).

    Returns:
        A category string, a comma-separated list of categories, or
        ``"Request human review"``.
    """
    print(f"Parsing category from response: {response}")
    # Built once instead of once per candidate.
    known = {c.lower() for c in categories}

    try:
        response_json = json.loads(response)
    except json.JSONDecodeError:
        print("Response is not in JSON format. Proceeding with fuzzy matching.")
    else:
        # json.loads may return a list, number, string or null; only an object
        # can carry a 'category' key.
        category_response = response_json.get('category') if isinstance(response_json, dict) else None
        if isinstance(category_response, list):
            matches = [
                cat for cat in category_response
                if isinstance(cat, str) and cat.lower() in known
            ]
            if matches:
                return ', '.join(matches)
        elif isinstance(category_response, str):
            lowered = category_response.lower()
            if lowered in known:
                return category_response
            if lowered == "none fit":
                return "Request human review"

    # Fallback: fuzzy-match the raw reply against every category.
    best_match = None
    highest_score = 0
    for category in categories:
        score = fuzz.partial_ratio(category.lower(), response.lower())
        if score > highest_score:
            best_match = category
            highest_score = score
            print(f"New best match found: {best_match} with score {score}")
    if highest_score >= 80:
        return best_match
    return "Request human review"
def categorize(proposal, categories, allow_multiple_categories, language='English', retries=3, is_single_proposal=True):
    """Categorize one proposal with the chat model, retrying on unusable replies.

    The system prompt is chosen from ``messages[language]`` (multiple- or
    single-category variant) and filled with the comma-joined categories; the
    user prompt is filled with the proposal text. Each attempt calls
    ``make_request`` and hands the reply to ``extract_json_category``; the first
    truthy result is returned.

    Args:
        proposal: Proposal text to categorize.
        categories: List of category names.
        allow_multiple_categories: Selects the multiple-category system prompt
            when truthy, the single-category one otherwise.
        language: Key into ``messages``; an unknown key raises ``KeyError``.
        retries: Maximum number of model calls.
        is_single_proposal: Accepted for interface compatibility; not used here.

    Returns:
        The extracted category string, or ``"Human intervention required"``
        when no attempt produced a usable result.
    """
    print(f"Attempting to categorize proposal: {proposal}")
    initial_temperature = 0.3  # Starting with a more conservative attempt
    temperature_increment = 0.15  # Raised on each retry to get a different reply
    max_temperature = 1.0  # Keeps large `retries` values from escalating without bound

    def temperature_for(attempt_index):
        # round() avoids artefacts such as 0.44999999999999996 in requests and logs.
        raw = initial_temperature + attempt_index * temperature_increment
        return min(round(raw, 2), max_temperature)

    templates = messages[language]
    system_key = (
        'multiple_categories_system_message'
        if allow_multiple_categories
        else 'single_category_system_message'
    )
    system_message = templates[system_key].format(", ".join(categories))
    user_message = templates['user_message'].format(proposal)

    for attempt in range(retries):
        temperature = temperature_for(attempt)
        print(f"Attempt {attempt + 1}/{retries}, using temperature: {temperature}")
        # Pass the escalated temperature; a fixed 0.3 here would make every retry
        # an identical request.
        response = make_request(
            client, model, system_message, user_message,
            temperature=temperature, top_p=0.6, max_tokens=50,
        )
        category = extract_json_category(response, categories)
        if category:
            print(f"Categorization successful: {category}")
            return category
        if attempt + 1 < retries:
            print(f"Response did not meet criteria or was not in JSON. Retrying with temperature: {temperature_for(attempt + 1)}")
        else:
            print("Response did not meet criteria or was not in JSON. No retries left.")

    print("Human intervention required")
    return "Human intervention required"
# Result returned when the reply is valid JSON but names no usable category.
_HUMAN_REVIEW_RESULT = "Intervention humaine requise"

# Lower-cased "no category applies" answers; the Spanish and French phrases are
# the ones the prompt templates in this file ask the model to use.
_NONE_FIT_PHRASES = ("none fit", "ninguna corresponde", "aucune correspondance")


def _first_json_object(text):
    """Return the first JSON object embedded in *text*, or None if there is none."""
    decoder = json.JSONDecoder()
    for brace in re.finditer(r'\{', text):
        try:
            # raw_decode stops at the end of the first complete JSON value, so
            # trailing prose (even prose containing braces) is ignored.
            candidate, _end = decoder.raw_decode(text[brace.start():])
        except json.JSONDecodeError:
            continue  # nothing decodable at this brace; try the next one
        if isinstance(candidate, dict):
            return candidate
    return None


def _best_category_match(candidate, categories, threshold=80):
    """Return the entry of *categories* that best matches *candidate*, or None."""
    if not isinstance(candidate, str):
        return None
    wanted = candidate.strip().lower()
    if not wanted:
        return None
    # A case-insensitive exact match is the best possible score; take it directly.
    for category in categories:
        if category.lower() == wanted:
            return category
    best_match, highest_score = None, 0
    for category in categories:
        score = fuzz.ratio(wanted, category.lower())
        if score > highest_score:
            best_match, highest_score = category, score
    return best_match if highest_score >= threshold else None


def extract_json_category(response, categories):
    """Extract the category (or categories) named in a model reply.

    The first JSON object found anywhere in ``response`` is decoded and its
    ``category`` value is matched against ``categories`` (exact match ignoring
    case, otherwise ``fuzz.ratio`` with a threshold of 80).

    Args:
        response: Reply text from the model.
        categories: List of configured category names.

    Returns:
        * the matching configured category, for a string value;
        * the matching configured categories joined with ``", "`` (duplicates
          removed, reply order kept), for a list value;
        * ``"Intervention humaine requise"`` when the JSON is valid but names
          no usable category (including an explicit "none fit" answer);
        * ``None`` when no JSON object can be decoded from the reply, which
          callers treat as "retry".

    NOTE(review): the original indentation of this function was lost, so it is
    unclear whether a reply with no ``{...}`` at all originally returned None
    or the review string. None is used here so such replies are retried.
    """
    if not isinstance(response, str):
        return None

    response_json = _first_json_object(response)
    if response_json is None:
        print("Response is not valid JSON.")
        return None

    category_val = response_json.get('category')
    if isinstance(category_val, str):
        if category_val.strip().lower() in _NONE_FIT_PHRASES:
            return _HUMAN_REVIEW_RESULT
        match = _best_category_match(category_val, categories)
        if match is not None:
            return match
    elif isinstance(category_val, list):
        valid_categories = []
        for item in category_val:
            match = _best_category_match(item, categories)
            if match is not None and match not in valid_categories:
                valid_categories.append(match)
        if valid_categories:
            return ', '.join(valid_categories)

    return _HUMAN_REVIEW_RESULT
def fetch_and_save_proposals(input_url, file_name="proposals.xlsx", locale="en"):
    """Download the proposals of a participatory process and save them to Excel.

    The API endpoint is derived from ``input_url`` via
    ``get_api_url_from_process_url`` and the process slug is taken from the
    path segment following ``/processes/``. One GraphQL query fetches every
    component's proposals with title and body translated to ``locale``. Each
    proposal is written as ``"<title>:<body>"`` in the first column of
    ``file_name``, without a header row, so the file has the same layout as an
    uploaded workbook read with ``header=None``.

    Args:
        input_url: URL of a page inside the participatory process.
        file_name: Path of the Excel file to write.
        locale: Locale code passed to the ``translation`` fields.

    Returns:
        ``file_name`` on success, or ``None`` when ``input_url`` has no
        scheme/host or no ``/processes/<slug>`` segment.

    Raises:
        Exception: when the API answers with a non-200 status or the response
            lacks the expected process/components data.
        requests.RequestException: on network failure or timeout.
    """
    api_url = get_api_url_from_process_url(input_url)
    if not api_url:
        print("Invalid URL provided.")
        return None

    # Use only the URL path so query strings and fragments cannot end up in the slug.
    path = urlparse(input_url).path
    if "/processes/" not in path:
        print("Invalid URL provided.")
        return None
    participatory_process_slug = path.split("/processes/", 1)[1].split("/", 1)[0]
    if not participatory_process_slug:
        print("Invalid URL provided.")
        return None

    # Slug and locale travel as GraphQL variables rather than being pasted into
    # the query text, so user-supplied characters cannot alter the query.
    query = """
    query ProposalsForProcess($slug: String!, $locale: String!) {
      participatoryProcess(slug: $slug) {
        components {
          id
          ... on Proposals {
            proposals {
              edges {
                node {
                  id
                  title {
                    translation(locale: $locale)
                  }
                  body {
                    translation(locale: $locale)
                  }
                  createdAt
                }
              }
            }
          }
        }
      }
    }
    """
    variables = {"slug": participatory_process_slug, "locale": locale}

    print(f"Making request to API URL: {api_url}")
    # A timeout keeps an unresponsive server from hanging the request forever.
    response = requests.post(api_url, json={'query': query, 'variables': variables}, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Query failed to run by returning code of {response.status_code}.")

    data = response.json()
    process = (data.get('data') or {}).get('participatoryProcess') or {}
    components = process.get('components')
    if not components:
        # Handle case where the expected data is missing
        raise Exception("The expected data was not found in the response.")

    proposals = []
    for component in components:
        # Only Proposals components carry a 'proposals' connection.
        connection = (component or {}).get('proposals') or {}
        for edge in connection.get('edges') or []:
            node = (edge or {}).get('node') or {}
            title = (node.get('title') or {}).get('translation') or ""
            body = (node.get('body') or {}).get('translation') or ""
            if not (title or body):
                # No text in this locale; skip it rather than emit "None:None".
                continue
            proposals.append(f"{title}:{body}")

    df = pd.DataFrame(proposals, columns=["Proposal"])
    # header=False: consumers in this file read the workbook with header=None,
    # so a header row would be treated as a proposal.
    df.to_excel(file_name, index=False, header=False)
    return file_name
def pipeline(excel_file, categories, allow_multiple_categories, language='English'):
    """Categorize every proposal listed in the first column of an Excel file.

    The workbook is read without a header row. Each non-empty cell of the first
    column is passed to ``categorize``.

    Args:
        excel_file: Path or file object accepted by ``pandas.read_excel``.
        categories: List of category names.
        allow_multiple_categories: Forwarded to ``categorize``.
        language: Prompt language forwarded to ``categorize``; defaults to
            ``'English'``, the language ``categorize`` uses when none is given.

    Returns:
        A DataFrame with columns ``'Proposal'`` and ``'Categorized As'``, one
        row per categorized proposal (empty when the sheet has no data).
    """
    df = pd.read_excel(excel_file, header=None)
    results = []
    # An empty sheet has no column 0, and blank cells are read as NaN; neither
    # should be sent to the model.
    if not df.empty:
        for proposal in df[0].dropna():
            print(f"Processing proposal: {proposal}")
            category = categorize(proposal, categories, allow_multiple_categories, language)
            results.append((proposal, category))
    results_df = pd.DataFrame(results, columns=['Proposal', 'Categorized As'])
    print("Processing complete.")
    return results_df
def process_inputs(file_upload, url_input, categories_str, allow_mult, language):
    """Categorize proposals from an uploaded workbook or a process URL.

    An uploaded file takes priority over the URL. Proposals are read from the
    first column (no header row), each is passed to ``categorize``, and the
    results are rendered both as an HTML table and as a base64 data-URI link
    to an Excel file.

    Args:
        file_upload: Uploaded Excel file (anything ``pandas.read_excel``
            accepts), or a falsy value when nothing was uploaded.
        url_input: URL of the process page, used when no file is uploaded.
        categories_str: Comma-separated category names.
        allow_mult: Whether a proposal may receive several categories.
        language: ``'English'``, ``'French'`` or anything else (treated as
            Spanish when choosing the API locale).

    Returns:
        A 2-tuple ``(html, download_link_html)``, matching the two outputs the
        UI binds to this function. On invalid input the first element is a
        message and the second is an empty string.
    """
    url_input = (url_input or "").strip()
    if not file_upload and not url_input:
        # No input provided
        return "Please enter either a URL or upload an Excel file.", ""

    # Drop empty entries so "" or "a,,b" never yields a blank category.
    categories = [cat.strip() for cat in (categories_str or "").split(',') if cat.strip()]
    if not categories:
        return "Please enter at least one category.", ""

    if file_upload:
        # Process Excel file upload
        df = pd.read_excel(file_upload, header=None)
    else:
        # Process URL input if no file was uploaded
        locale = {"French": "fr", "English": "en"}.get(language, "es")
        try:
            file_name = fetch_and_save_proposals(url_input, locale=locale)
        except Exception:
            # UI boundary: log the full error and show a short message instead
            # of letting the exception escape to the front end.
            logging.exception("Fetching proposals from %s failed", url_input)
            file_name = None
        if not file_name:
            # Fetching proposals failed, likely due to URL issues.
            return "URL issue. Please try again.", ""
        df = pd.read_excel(file_name, header=None)

    results = []
    # An empty sheet has no column 0, and blank cells are read as NaN.
    if not df.empty:
        for proposal in df[0].dropna():
            category = categorize(proposal, categories, allow_mult, language)
            results.append((proposal, category))
    results_df = pd.DataFrame(results, columns=['Proposal', 'Category'])

    # Proposal text is untrusted (uploads, remote API), so let pandas escape it
    # before it is shown in an HTML component.
    output_html = results_df.to_html(escape=True, index=False)

    # Build the Excel download as an in-memory, base64-encoded data URI.
    output = io.BytesIO()
    results_df.to_excel(output, index=False)
    excel_base64 = base64.b64encode(output.getvalue()).decode('utf-8')
    download_link = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{excel_base64}" download="categorized_results.xlsx">Download Excel file</a>'
    return output_html, download_link
# Gradio UI: two input options (Excel upload or process URL), categorization
# settings, and two HTML outputs (results table and download link).
# NOTE(review): the original indentation was lost; the layout below assumes only
# the two "option" columns sit inside the Row. Confirm against the deployed app.
with gr.Blocks() as demo:
    gr.Markdown("# 📊 Automatic proposal categorization")
    gr.Markdown("This tool allows you to categorize proposals. Its primary intent is to do that for <a href='https://decidim.org' target='_blank'>Decidim</a> (open source participatory democracy framework) proposals, but you can use it for any proposals you want.")
    gr.Markdown("<i>The tool was built by the team at <a href='https://opensourcepolitics.eu' target='_blank'>Open Source Politics</a>.</i>")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 1st option : Upload an Excel file with one column of proposals")
            # File extensions passed to file_types need a leading dot.
            file_upload = gr.File(label="📁 Upload the Excel file", file_types=[".xlsx"])
        with gr.Column():
            gr.Markdown("### 2nd option : Classify Decidim proposals")
            url_input = gr.Textbox(label="🌐 Proposal component URL", placeholder="Enter the URL here")

    language_select = gr.Dropdown(choices=['English', 'Spanish', 'French'], label="Select Language", value='English')
    categories_input = gr.Textbox(label="🏷️ Enter categories (separated by commas)", placeholder="for example: Education, Health, Environment, Other")
    allow_multiple_categories = gr.Checkbox(label="✅ Allow more than one category per proposal", value=True)
    submit_button = gr.Button("🚀 Find the categories", variant="primary")
    output_html = gr.HTML(label="📋 Results")
    output_download = gr.HTML(label="📥 Download results in XLSX")

    # The input order matches process_inputs' parameters, so it is bound directly.
    submit_button.click(
        fn=process_inputs,
        inputs=[file_upload, url_input, categories_input, allow_multiple_categories, language_select],
        outputs=[output_html, output_download],
    )

    gr.Markdown("### Instructions:")
    gr.Markdown("1. Excel Upload: The Excel file should contain the proposals listed in the first column without any header.")
    gr.Markdown("2. Proposals Component URL: Provide the full URL of a proposals component page where the proposals are listed.")
    gr.Markdown("3. Categories: Enter the categories separated by commas. For example, Education, Health, Environment.")
    gr.Markdown("4. Multiple Categories: Check this box if you want to allow proposals to be categorized into more than one category.")

demo.launch()