#Praise Jesus
#Stable version working with Llama but not satisfied with poor output
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import docx2txt
import os

# Authenticate with Hugging Face using an environment variable token if set
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    from huggingface_hub import login
    login(hf_token)

# Whisper model for audio transcription
whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-large")
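# Note: the ASR pipeline transcribes roughly 30 s of audio per pass by default; for
# longer recordings, chunked long-form transcription could be enabled, e.g. (a sketch,
# not applied here):
#     pipeline("automatic-speech-recognition", model="openai/whisper-large", chunk_length_s=30)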

# LLaMA 3.2 model for text processing
llama_model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(llama_model_id, token=hf_token)
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_id, torch_dtype=torch.bfloat16, token=hf_token)
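# Note: without device_map, the model loads on CPU; on a GPU-backed machine one could
# pass device_map="auto" (requires the `accelerate` package), e.g. (a sketch, not applied here):
#     AutoModelForCausalLM.from_pretrained(llama_model_id, torch_dtype=torch.bfloat16, device_map="auto", token=hf_token)
# The tokenized inputs passed to generate() would then also need .to(llama_model.device).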

# Function to generate response using LLaMA 3.2 model
def get_llama_response(input_text):
    # Ensure input is detailed enough
    if len(input_text.split()) < 10:
        return "Please provide a more detailed user story to help generate relevant needs and wants."

    # Define prompt for LLaMA model
    prompt = f"""
    Based on the user story "{input_text}", extract any unarticulated needs and wants.
    Only provide essential needs and wants directly relevant to the given story.
    Do not speculate or over-extrapolate.
    """
    
    # Process the prompt with LLaMA 3.2 and decode only the newly generated tokens,
    # so the prompt itself is not echoed back in the response
    inputs = tokenizer(prompt, return_tensors="pt")
    llama_output = llama_model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    response_text = tokenizer.decode(llama_output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    return response_text
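
# Optional: Llama-3.2-3B-Instruct is a chat-tuned model, so output quality may improve if
# the prompt is wrapped in the tokenizer's chat template. A minimal sketch of that
# alternative (not wired into get_llama_response above):
#     messages = [{"role": "user", "content": prompt}]
#     chat_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
#     llama_output = llama_model.generate(chat_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)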

# Main processing function for Gradio interface
def process_input(user_story=None, user_audio=None, user_file=None):
    # Process audio input if provided
    if user_audio:
        transcription = whisper_model(user_audio)["text"]
        user_story = transcription

    # Process file input if provided and if text is empty
    if user_file and not user_story:
        user_story = docx2txt.process(user_file)

    # Ensure there's text to process
    if not user_story:
        return "Please provide a user story, an audio file, or upload a Word file."

    # Generate response with LLaMA 3.2
    llama_response = get_llama_response(user_story)

    return f"LLaMA Output:\n{llama_response}"

# Gradio interface with text, audio, and file inputs
interface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="User Story (Text Input)", placeholder="Enter your user story here..."),
        gr.Audio(type="filepath", label="User Story (Audio Input)"),
        gr.File(label="Upload Word File (.docx)")  # Removed `optional=True`
    ],
    outputs="text",
    title="Multimodal Needs & Wants Extractor",
    description="**Author:** VictorDaniel\n\nEnter a detailed user story or upload an audio/Word file to extract the unarticulated needs and wants.",
    examples=[
        # Each example must supply one value per input component (text, audio, file)
        ["The user often speaks about wanting to improve their health but is hesitant to join a gym.", None, None]
    ]
)

# Launch the Gradio app
interface.launch()
#app2