MarMont committed
Commit 6dba7a5 (1 parent: 397900b)

compile all lda

Files changed (2)
  1. app.py +62 -71
  2. appv1.py +559 -0
app.py CHANGED
@@ -125,7 +125,64 @@ def tokenize(text):
 
      return tokens
 
- def cleaning(df):
+ def split_corpus(corpus, n):
+     for i in range(0, len(corpus), n):
+         corpus_split = corpus
+         yield corpus_split[i:i + n]
+
+ def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
+     coherence_values = []
+     model_list = []
+     for num_topics in range(start, limit, step):
+         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 per_word_topics=True,
+                                                 id2word=id2word)
+         model_list.append(model)
+         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
+         coherence_values.append(coherencemodel.get_coherence())
+
+     return model_list, coherence_values
+
+ def compute_coherence_values2(corpus, dictionary, k, a, b):
+     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 id2word=id2word,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 alpha=a,
+                                                 eta=b,
+                                                 per_word_topics=True)
+     coherence_model_lda = CoherenceModel(model=lda_model,
+                                          texts=df['lemma_tokens'],
+                                          dictionary=id2word,
+                                          coherence='c_v')
+
+     return coherence_model_lda.get_coherence()
+
+ def assignMaxTopic(l):
+     maxTopic = max(l,key=itemgetter(1))[0]
+     return maxTopic
+
+ def assignTopic(l):
+     topics = []
+     for x in l:
+         topics.append(x[0])
+
+ def get_topic_value(row, i):
+     if len(row) == 1:
+         return row[0][1]
+     else:
+         try:
+             return row[i][1]
+         except Exception as e:
+             print(e)
+
+ def full_lda():
      df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
 
      # Apply the function above and get tweets free of emoji's
@@ -184,29 +241,6 @@ def cleaning(df):
      # Apply tokenizer
      df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
 
- def split_corpus(corpus, n):
-     for i in range(0, len(corpus), n):
-         corpus_split = corpus
-         yield corpus_split[i:i + n]
-
- def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
-     coherence_values = []
-     model_list = []
-     for num_topics in range(start, limit, step):
-         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
-                                                 num_topics=num_topics,
-                                                 random_state=100,
-                                                 chunksize=200,
-                                                 passes=10,
-                                                 per_word_topics=True,
-                                                 id2word=id2word)
-         model_list.append(model)
-         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
-         coherence_values.append(coherencemodel.get_coherence())
-
-     return model_list, coherence_values
-
- def base_lda():
      # Create a id2word dictionary
      global id2word
      id2word = Dictionary(df['lemma_tokens'])
@@ -253,24 +287,6 @@ def base_lda():
      global num_topics
      num_topics = coherence_averages.index(k_max) + 2
 
- def compute_coherence_values2(corpus, dictionary, k, a, b):
-     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
-                                                 id2word=id2word,
-                                                 num_topics=num_topics,
-                                                 random_state=100,
-                                                 chunksize=200,
-                                                 passes=10,
-                                                 alpha=a,
-                                                 eta=b,
-                                                 per_word_topics=True)
-     coherence_model_lda = CoherenceModel(model=lda_model,
-                                          texts=df['lemma_tokens'],
-                                          dictionary=id2word,
-                                          coherence='c_v')
-
-     return coherence_model_lda.get_coherence()
-
- def hyperparameter_optimization():
      grid = {}
      grid['Validation_Set'] = {}
 
@@ -337,21 +353,9 @@ def hyperparameter_optimization():
                                                         per_word_topics=True)
 
      coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
-                                          coherence='c_v')
+                                          coherence='c_v')
      coherence_lda = coherence_model_lda.get_coherence()
-
-     return coherence_lda
-
- def assignMaxTopic(l):
-     maxTopic = max(l,key=itemgetter(1))[0]
-     return maxTopic
-
- def assignTopic(l):
-     topics = []
-     for x in l:
-         topics.append(x[0])
-
- def topic_assignment(df):
+
      lda_topics = lda_model_final.show_topics(num_words=10)
 
      topics = []
@@ -371,16 +375,6 @@ def topic_assignment(df):
          topic_clusters.append(df[df['max_topic'].isin(([i]))])
          topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()
 
- def get_topic_value(row, i):
-     if len(row) == 1:
-         return row[0][1]
-     else:
-         try:
-             return row[i][1]
-         except Exception as e:
-             print(e)
-
- def reprsentative_tweets():
      global top_tweets
      top_tweets = []
      for i in range(len(topic_clusters)):
@@ -394,6 +388,7 @@ def reprsentative_tweets():
          top_tweets.append(rep_tweets[:5])
          # print('Topic ', i)
          # print(rep_tweets[:5])
+
      return top_tweets
 
  def topic_summarization(topic_groups):
@@ -521,14 +516,10 @@ def main(dataset, model):
          print(dataset)
          place_data = str(scrape(keyword_list))
      print(df)
-     cleaning(df)
 
      print(df)
      if model == 'LDA':
-         base_lda()
-         coherence = hyperparameter_optimization()
-         topic_assignment(df)
-         top_tweets = reprsentative_tweets()
+         top_tweets = full_lda()
      else:
          base_bertopic()
          optimized_bertopic()
appv1.py ADDED
@@ -0,0 +1,559 @@
+ import pandas as pd
+ import tweepy
+ import re
+ import emoji
+ import spacy
+ import gensim
+ import json
+ import string
+
+ from spacy.tokenizer import Tokenizer
+ from gensim.parsing.preprocessing import STOPWORDS as SW
+ from wordcloud import STOPWORDS
+
+ from gensim.corpora import Dictionary
+ from gensim.models.coherencemodel import CoherenceModel
+ from pprint import pprint
+
+ import numpy as np
+ import tqdm
+
+ from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
+
+ import torch
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
+ from googletrans import Translator
+
+ from bertopic import BERTopic
+ from umap import UMAP
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ from operator import itemgetter
+
+ import gradio as gr
+
+ global df
+ bearer_token = 'AAAAAAAAAAAAAAAAAAAAACEigwEAAAAACoP8KHJYLOKCL4OyB9LEPV00VB0%3DmyeDROUvw4uipHwvbPPfnTuY0M9ORrLuXrMvcByqZhwo3SUc4F'
+ client = tweepy.Client(bearer_token=bearer_token)
+ nlp = spacy.load('en_core_web_lg')
+ print('hi')
+
+ def scrape(keywords):
+     query = keywords + ' (lang:en OR lang:tl) -is:retweet'
+     max_results = 100
+     tweet_fields=['geo', 'id', 'lang', 'created_at']
+     expansions=['geo.place_id']
+     place_fields = ['contained_within', 'country', 'country_code', 'full_name', 'geo', 'id', 'name', 'place_type']
+
+     response = client.search_recent_tweets(
+         query=query,
+         max_results=max_results,
+         tweet_fields=tweet_fields,
+         expansions=expansions,
+         place_fields=place_fields
+     )
+
+     tweets = []
+     for x in response[0]:
+         tweets.append(str(x))
+
+     place_data = response[1]
+
+     df = pd.DataFrame(tweets, columns=['tweet'])
+
+     return place_data
+
+ def get_example(dataset):
+     df = pd.read_csv(dataset + '.csv')
+     return df
+
+ def give_emoji_free_text(text):
+     """
+     Removes emoji's from tweets
+     Accepts:
+         Text (tweets)
+     Returns:
+         Text (emoji free tweets)
+     """
+     emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
+     clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
+     return clean_text
+
+ def url_free_text(text):
+     '''
+     Cleans text from urls
+     '''
+     text = re.sub(r'http\S+', '', text)
+     return text
+
+ def get_lemmas(text):
+     '''Used to lemmatize the processed tweets'''
+     lemmas = []
+
+     doc = nlp(text)
+
+     for token in doc:
+         if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
+             lemmas.append(token.lemma_)
+
+     return lemmas
+
+ # Tokenizer function
+ def tokenize(text):
+     """
+     Parses a string into a list of semantic units (words)
+     Args:
+         text (str): The string that the function will tokenize.
+     Returns:
+         list: tokens parsed out
+     """
+     # Removing url's
+     pattern = r"http\S+"
+
+     tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
+     tokens = re.sub('[^a-zA-Z 0-9]', '', text)
+     tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
+     tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
+     # tokens = re.sub('@*!*$*', '', text) # Remove @ ! $
+     tokens = tokens.strip(',') # TESTING THIS LINE
+     tokens = tokens.strip('?') # TESTING THIS LINE
+     tokens = tokens.strip('!') # TESTING THIS LINE
+     tokens = tokens.strip("'") # TESTING THIS LINE
+     tokens = tokens.strip(".") # TESTING THIS LINE
+
+     tokens = tokens.lower().split() # Make text lowercase and split it
+
+     return tokens
+
+
+ def cleaning(df):
+     df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
+
+     # Apply the function above and get tweets free of emoji's
+     call_emoji_free = lambda x: give_emoji_free_text(x)
+
+     # Apply `call_emoji_free` which calls the function to remove all emoji's
+     df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)
+
+     #Create a new column with url free tweets
+     df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)
+
+
+
+     f = open('stopwords-tl.json')
+     tlStopwords = json.loads(f.read())
+     stopwords = set(STOPWORDS)
+     stopwords.update(tlStopwords)
+     stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])
+
+     # Tokenizer
+     tokenizer = Tokenizer(nlp.vocab)
+
+
+     # Custom stopwords
+     custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
+
+
+     # Customize stop words by adding to the default list
+     STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
+
+     # ALL_STOP_WORDS = spacy + gensim + wordcloud
+     ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)
+
+
+     tokens = []
+     STOP_WORDS.update(stopwords)
+
+     for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
+         doc_tokens = []
+         for token in doc:
+             if token.text.lower() not in STOP_WORDS:
+                 doc_tokens.append(token.text.lower())
+         tokens.append(doc_tokens)
+
+     # Makes tokens column
+     df['tokens'] = tokens
+
+     # Make tokens a string again
+     df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]
+
+     df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)
+
+     # Make lemmas a string again
+     df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
+
+     # Apply tokenizer
+     df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
+
+ def split_corpus(corpus, n):
+     for i in range(0, len(corpus), n):
+         corpus_split = corpus
+         yield corpus_split[i:i + n]
+
+ def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
+     coherence_values = []
+     model_list = []
+     for num_topics in range(start, limit, step):
+         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 per_word_topics=True,
+                                                 id2word=id2word)
+         model_list.append(model)
+         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
+         coherence_values.append(coherencemodel.get_coherence())
+
+     return model_list, coherence_values
+
+ def base_lda():
+     # Create a id2word dictionary
+     global id2word
+     id2word = Dictionary(df['lemma_tokens'])
+
+     # Filtering Extremes
+     id2word.filter_extremes(no_below=2, no_above=.99)
+
+     # Creating a corpus object
+     global corpus
+     corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
+     global corpus_og
+     corpus_og = [id2word.doc2bow(d) for d in df['lemma_tokens']]
+
+     corpus_split = corpus
+     split_corpus(corpus_split, 5)
+
+     global coherence
+     coherence = 'c_v'
+
+     coherence_averages = [0] * 8
+     for i in range(5):
+         training_corpus = corpus_split
+         training_corpus.remove(training_corpus[i])
+         print(training_corpus[i])
+         model_list, coherence_values = compute_coherence_values_base_lda(dictionary=id2word, corpus=training_corpus,
+                                                                          texts=df['lemma_tokens'],
+                                                                          start=2,
+                                                                          limit=10,
+                                                                          step=1,
+                                                                          coherence=coherence)
+         for j in range(len(coherence_values)):
+             coherence_averages[j] += coherence_values[j]
+
+     limit = 10; start = 2; step = 1;
+     x = range(start, limit, step)
+
+     coherence_averages = [x / 5 for x in coherence_averages]
+
+     if coherence == 'c_v':
+         k_max = max(coherence_averages)
+     else:
+         k_max = min(coherence_averages, key=abs)
+
+     global num_topics
+     num_topics = coherence_averages.index(k_max) + 2
+
+ def compute_coherence_values2(corpus, dictionary, k, a, b):
+     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                 id2word=id2word,
+                                                 num_topics=num_topics,
+                                                 random_state=100,
+                                                 chunksize=200,
+                                                 passes=10,
+                                                 alpha=a,
+                                                 eta=b,
+                                                 per_word_topics=True)
+     coherence_model_lda = CoherenceModel(model=lda_model,
+                                          texts=df['lemma_tokens'],
+                                          dictionary=id2word,
+                                          coherence='c_v')
+
+     return coherence_model_lda.get_coherence()
+
+ def hyperparameter_optimization():
+     grid = {}
+     grid['Validation_Set'] = {}
+
+     min_topics = 1
+     max_topics = 10
+     step_size = 1
+     topics_range = range(min_topics, max_topics, step_size)
+
+     alpha = [0.05, 0.1, 0.5, 1, 5, 10]
+     # alpha.append('symmetric')
+     # alpha.append('asymmetric')
+
+     beta = [0.05, 0.1, 0.5, 1, 5, 10]
+     # beta.append('symmetric')
+
+     num_of_docs = len(corpus_og)
+     corpus_sets = [gensim.utils.ClippedCorpus(corpus_og, int(num_of_docs*0.75)),
+                    corpus_og]
+     corpus_title = ['75% Corpus', '100% Corpus']
+     model_results = {'Validation_Set': [],
+                      'Alpha': [],
+                      'Beta': [],
+                      'Coherence': []
+                      }
+     if 1 == 1:
+         pbar = tqdm.tqdm(total=540)
+
+         for i in range(len(corpus_sets)):
+             for a in alpha:
+                 for b in beta:
+                     cv = compute_coherence_values2(corpus=corpus_sets[i],
+                                                    dictionary=id2word,
+                                                    k=num_topics,
+                                                    a=a,
+                                                    b=b)
+                     model_results['Validation_Set'].append(corpus_title[i])
+                     model_results['Alpha'].append(a)
+                     model_results['Beta'].append(b)
+                     model_results['Coherence'].append(cv)
+
+                     pbar.update(1)
+         pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
+         pbar.close()
+
+     params_df = pd.read_csv('lda_tuning_results_new.csv')
+     params_df = params_df[params_df.Validation_Set == '75% Corpus']
+     params_df.reset_index(inplace=True)
+     params_df = params_df.replace(np.inf, -np.inf)
+     max_params = params_df.loc[params_df['Coherence'].idxmax()]
+     max_coherence = max_params['Coherence']
+     max_alpha = max_params['Alpha']
+     max_beta = max_params['Beta']
+     max_validation_set = max_params['Validation_Set']
+
+     global lda_model_final
+     lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus_og,
+                                                       id2word=id2word,
+                                                       num_topics=num_topics,
+                                                       random_state=100,
+                                                       chunksize=200,
+                                                       passes=10,
+                                                       alpha=max_alpha,
+                                                       eta=max_beta,
+                                                       per_word_topics=True)
+
+     coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
+                                          coherence='c_v')
+     coherence_lda = coherence_model_lda.get_coherence()
+
+     return coherence_lda
+
+ def assignMaxTopic(l):
+     maxTopic = max(l,key=itemgetter(1))[0]
+     return maxTopic
+
+ def assignTopic(l):
+     topics = []
+     for x in l:
+         topics.append(x[0])
+
+ def topic_assignment(df):
+     lda_topics = lda_model_final.show_topics(num_words=10)
+
+     topics = []
+     filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
+
+     for topic in lda_topics:
+         topics.append(preprocess_string(topic[1], filters))
+
+     df['topic'] = [sorted(lda_model_final[corpus_og][text][0]) for text in range(len(df['original_tweets']))]
+
+     df = df[df['topic'].map(lambda d: len(d)) > 0]
+     df['max_topic'] = df['topic'].map(lambda row: assignMaxTopic(row))
+
+     global topic_clusters
+     topic_clusters = []
+     for i in range(num_topics):
+         topic_clusters.append(df[df['max_topic'].isin(([i]))])
+         topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()
+
+ def get_topic_value(row, i):
+     if len(row) == 1:
+         return row[0][1]
+     else:
+         try:
+             return row[i][1]
+         except Exception as e:
+             print(e)
+
+ def reprsentative_tweets():
+     global top_tweets
+     top_tweets = []
+     for i in range(len(topic_clusters)):
+         tweets = df.loc[df['max_topic'] == i]
+         tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))
+         # tweets['topic'] = [row[i][1] for row in tweets['topic']]
+         tweets_sorted = tweets.sort_values('topic', ascending=False)
+         tweets_sorted.drop_duplicates(subset=['original_tweets'])
+         rep_tweets = tweets_sorted['original_tweets']
+         rep_tweets = [*set(rep_tweets)]
+         top_tweets.append(rep_tweets[:5])
+         # print('Topic ', i)
+         # print(rep_tweets[:5])
+     return top_tweets
+
+ def topic_summarization(topic_groups):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
+     tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
+     model = model.to(device)
+     translator = Translator()
+
+     headlines = []
+     for i in range(len(topic_groups)):
+         tweets = " ".join(topic_groups[i])
+         # print(tweets)
+         out = translator.translate(tweets, dest='en')
+         text = out.text
+         # print(tweets)
+
+         max_len = 256
+
+         encoding = tokenizer.encode_plus(text, return_tensors = "pt")
+         input_ids = encoding["input_ids"].to(device)
+         attention_masks = encoding["attention_mask"].to(device)
+
+         beam_outputs = model.generate(
+             input_ids = input_ids,
+             attention_mask = attention_masks,
+             max_length = 64,
+             num_beams = 3,
+             early_stopping = True,
+         )
+
+         result = tokenizer.decode(beam_outputs[0])
+         headlines += "Topic " + str(i) + " " + result
+
+     return headlines
+
+ def compute_coherence_value_bertopic(topic_model):
+     topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics))-1)]
+     coherence_model = CoherenceModel(topics=topic_words,
+                                      texts=df['lemma_tokens'],
+                                      corpus=corpus,
+                                      dictionary=id2word,
+                                      coherence=coherence)
+     coherence_score = coherence_model.get_coherence()
+
+     return coherence_score
+
+ def base_bertopic():
+     df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
+     global id2word
+     id2word = Dictionary(df['lemma_tokens'])
+     global corpus
+     corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
+
+     global umap_model
+     umap_model = UMAP(n_neighbors=15,
+                       n_components=5,
+                       min_dist=0.0,
+                       metric='cosine',
+                       random_state=100)
+
+     base_topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)
+
+     topics, probabilities = base_topic_model.fit_transform(df['lemma_tokens_string'])
+
+     try:
+         print(compute_coherence_value_bertopic(base_topic_model))
+     except:
+         print('Unable to generate meaningful topics (Base BERTopic model)')
+
+ def optimized_bertopic():
+     vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
+     optimized_topic_model = BERTopic(umap_model=umap_model,
+                                      language="multilingual",
+                                      n_gram_range=(1, 3),
+                                      vectorizer_model=vectorizer_model,
+                                      calculate_probabilities=True)
+
+     topics, probabilities = optimized_topic_model.fit_transform(df['lemma_tokens_string'])
+
+     try:
+         print(compute_coherence_value_bertopic(optimized_topic_model))
+     except:
+         print('Unable to generate meaningful topics, base BERTopic model if possible')
+
+     rep_docs = optimized_topic_model.representative_docs_
+
+     global top_tweets
+     top_tweets = []
+
+     for topic in rep_docs:
+         if topic == -1:
+             print('test')
+             continue
+         topic_docs = rep_docs.get(topic)
+
+         tweets = []
+         for doc in topic_docs:
+             index = df.isin([doc]).any(axis=1).idxmax()
+             # print(index)
+             tweets.append(df.loc[index, 'original_tweets'])
+         print(tweets)
+         top_tweets.append(tweets)
+
+ global examples
+
+ def main(dataset, model):
+     global df
+     examples = [ "katip,katipunan",
+                  "bgc,bonifacio global city",
+                  "pobla,poblacion",
+                  "cubao",
+                  "taft"
+                  ]
+     keyword_list = dataset.split(',')
+     if len(keyword_list) > 1:
+         keywords = '(' + ' OR '.join(keyword_list) + ')'
+     else:
+         keywords = keyword_list[0]
+     if dataset in examples:
+         df = get_example(keywords)
+         place_data = 'test'
+     else:
+         print(dataset)
+         place_data = str(scrape(keyword_list))
+     print(df)
+     cleaning(df)
+
+     print(df)
+     if model == 'LDA':
+         base_lda()
+         coherence = hyperparameter_optimization()
+         topic_assignment(df)
+         top_tweets = reprsentative_tweets()
+     else:
+         base_bertopic()
+         optimized_bertopic()
+
+     headlines = topic_summarization(top_tweets)
+     headlines = '\n'.join(str(h) for h in headlines)
+
+
+
+     return place_data, headlines
+
+
+ iface = gr.Interface(fn=main,
+                      inputs=[gr.Dropdown(["katip,katipunan",
+                                           "bgc,bonifacio global city",
+                                           "cubao",
+                                           "taft",
+                                           "pobla,poblacion"],
+                                          label="Dataset"),
+                              gr.Dropdown(["LDA",
+                                           "BERTopic"],
+                                          label="Model")
+                              ],
+                      # examples=examples,
+                      outputs=["text",
+                               "text"])
+ iface.launch()