Spaces:

tomascufaro
/

keyword_classification

Sleeping

File size: 5,513 Bytes

ad1fbe3
 
 
 
 
 
47b7108
8fd5591
0911828
ad1fbe3
 
 
 
 
 
 
 
 
 
 
 
b8ae7f5
ad1fbe3
 
 
 
 
 
 
b8ae7f5
 
 
 
 
 
 
 
 
47b7108
ad1fbe3
b8ae7f5
ad1fbe3
b8ae7f5
ad1fbe3
 
 
 
 
 
 
b8ae7f5
ad1fbe3
b8ae7f5
ad1fbe3
b8ae7f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad1fbe3
 
b8ae7f5
ad1fbe3
 
 
 
 
 
 
 
 
 
b8ae7f5
 
ad1fbe3
 
 
 
b8ae7f5
 
ad1fbe3
 
 
 
 
 
b8ae7f5
ad1fbe3

import pandas as pd    # for data manipulation (pip install pandas)
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import gradio as gr
import os 
import collections

# Extraction schema: every extracted item must carry these string fields.
_REQUIRED_FIELDS = ["keyword", "category"]

schema = {
    "properties": {field: {"type": "string"} for field in _REQUIRED_FIELDS},
    "required": list(_REQUIRED_FIELDS),
}

# Messages shared by both prompt templates.
_SYSTEM_MESSAGE = ("system", "You are an expert marketing researcher")
_TIP_MESSAGE = ("human", "Tip: Make sure to answer in the correct format and DO NOT leave keywords without category and DO NOT skip keywords. Please categorize all the keywords that I give you, each keyword must have just one and only one category.")

# Human message used when a category list is supplied.
_HUMAN_WITH_CATEGORIES = """{prompt_input}.
         Here you have the categories splitted by coma: {categories}.
         and Here you have the keywords splitted by coma: {keywords}."""

# Human message used when only keywords are supplied.
_HUMAN_KEYWORDS_ONLY = """{prompt_input}.
         and Here you have the keywords splitted by coma: {keywords}."""

# Prompt expecting `prompt_input`, `categories` and `keywords`.
prompt = ChatPromptTemplate.from_messages(
    [_SYSTEM_MESSAGE, ("human", _HUMAN_WITH_CATEGORIES), _TIP_MESSAGE]
)

# Prompt expecting only `prompt_input` and `keywords`.
prompt_no_cat = ChatPromptTemplate.from_messages(
    [_SYSTEM_MESSAGE, ("human", _HUMAN_KEYWORDS_ONLY), _TIP_MESSAGE]
)

# One chat model (API key read from the OpenAI_APIKEY environment variable)
# backs both extraction chains.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.getenv("OpenAI_APIKEY"),
)
chain = create_extraction_chain(schema, llm, prompt, verbose=1)
chain_no_cat = create_extraction_chain(schema, llm, prompt_no_cat, verbose=1)

def _read_first_column(uploaded_file):
    """Return the first column of an uploaded CSV/Excel file as a list of strings.

    ``uploaded_file`` is any object exposing the file path as ``.name``.
    Empty cells are dropped and remaining values are converted to ``str`` so
    they can be safely joined into a prompt.
    """
    path = uploaded_file.name
    if os.path.splitext(path)[1].lower() in ('.xls', '.xlsx'):
        frame = pd.read_excel(path)
    else:
        try:
            frame = pd.read_csv(path)
        except Exception:
            # Not parseable as CSV: fall back to Excel, as the file may simply
            # carry an unexpected extension.
            frame = pd.read_excel(path)
    return frame.iloc[:, 0].dropna().astype(str).tolist()


def run_chain(input_prompt, keywords_file, categories_file=None, batch_size=50):
    """Categorize the uploaded keywords with the extraction chains.

    Args:
        input_prompt: Text substituted for ``prompt_input`` in the prompt.
        keywords_file: Uploaded CSV/Excel file (object with a ``.name`` path);
            keywords are taken from its first column.
        categories_file: Optional uploaded CSV/Excel file whose first column
            lists the allowed categories. When omitted, ``chain_no_cat`` is
            used and all keywords are sent in a single request.
        batch_size: Number of keywords per request when categories are given.

    Returns:
        A tuple ``(results, 'themes_results.csv')`` where ``results`` is the
        accumulated list of items returned by the chain. The CSV file is
        rewritten after every batch so partial progress is kept on disk.

    Raises:
        ValueError: If no keywords file is supplied, or ``batch_size`` is not
            positive while categories are in use.
    """
    if keywords_file is None:
        raise ValueError('A keywords file is required')
    keywords = _read_first_column(keywords_file)

    if categories_file is not None:
        if batch_size < 1:
            # A non-positive step would never advance through the keywords.
            raise ValueError('batch_size must be a positive integer')
        extractor = chain
        shared_inputs = {
            'prompt_input': input_prompt,
            'categories': ','.join(_read_first_column(categories_file)),
        }
    else:
        extractor = chain_no_cat
        shared_inputs = {'prompt_input': input_prompt}
        # Without categories everything goes in one request (presumably so
        # the model invents one consistent category set); max() keeps the
        # range() step valid when there are no keywords at all.
        batch_size = max(len(keywords), 1)

    results = []
    # Write up front so the returned path always exists and never carries
    # rows from a previous run, even when there is nothing to process.
    results_to_csv(results)
    for start in range(0, len(keywords), batch_size):
        batch = keywords[start:start + batch_size]
        try:
            result = extractor.run({**shared_inputs, 'keywords': ','.join(batch)})
        except Exception as E:
            # Best effort: report the failed batch and carry on with the rest.
            print('this batch did not worked from {} to {}'.format(start, start + batch_size))
            print(E)
            result = []
        results += result
        results_to_csv(results)
    return results, 'themes_results.csv'

def results_to_csv(results):
    """Write the extracted rows to ``themes_results.csv`` in the working directory.

    Args:
        results: Iterable of dicts, one per extracted item. Each distinct key
            becomes a CSV column; a dict that lacks a key gets an empty cell
            in that column instead of making the export fail.
    """
    # Building the frame from records (rather than from per-key lists) keeps
    # rows aligned even when some dicts are missing keys.
    pd.DataFrame(list(results)).to_csv('themes_results.csv', index=False)


# Gradio UI: an editable prompt, two file uploads, and two buttons that both
# call run_chain (with or without the categories file).
with gr.Blocks() as demo:
    prompt_input = gr.Text("""I need your help to analyze and categorize the provided list of keywords
into the appropriate categories. 
The goal is to understand information demand on search engines within this industry. Each keyword represents a search and it should have a relation with the category. 
Extract each keyword and assign the best category among the given categories. Return every keyword with the relative category in pairs.
If the categories are not given """)
    gr.Markdown("Upload CSV or xlsx with keywords: Just a csv  with all the keywords in one column. Should have a header")
    # Extensions need the leading dot, matching the categories uploader below.
    keywords_file = gr.File(file_types=['.csv', '.xlsx'], label='keywords')
    gr.Markdown("Upload CSV or xlsx with categories: Just a csv with all the categories in one column. Should have a header")
    categories_file = gr.File(file_types=['.csv', '.xlsx'], label='categories')
    btn = gr.Button(value="Run with categories")
    btn2 = gr.Button(value="Run without categories")
    txt_3 = gr.Textbox(value="", label="Output")
    output_file = gr.File(label="Output File", 
                file_count="single", 
                file_types=["", ".", ".csv",".xls",".xlsx"])
    
    # run_chain returns (results, csv_path), mapped to the textbox and the file.
    btn.click(run_chain, inputs=[prompt_input, keywords_file, categories_file], outputs=[txt_3, output_file])
    # Omitting categories_file leaves run_chain's categories_file at its None default.
    btn2.click(run_chain, inputs=[prompt_input, keywords_file], outputs=[txt_3, output_file])
demo.launch()