Spaces:

A-M-S
/

movie-genre

Runtime error

App Files Files Community

A-M-S committed on Jul 26, 2022

Commit

8fd825b

•

2 Parent(s): 3788106 7496ff3

Merge branch 'main' of https://huggingface.co/spaces/A-M-S/movie-genre

Browse files

Files changed (1) hide show

app.py +78 -56

app.py CHANGED Viewed

@@ -11,9 +11,10 @@ from utility import Utility
 st.title("Movie Genre Predictor")
-st.subheader("Enter the text you'd like to analyze.")
 text = st.text_input('Enter plot of the movie')
-wiki_url = st.text_input("Enter wikipedia url of the movie (Needed for fetching the cast information)")
 model = AutoModelForSequenceClassification.from_pretrained("./checkpoint-36819")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -28,69 +29,90 @@ meta_model = pickle.load(open("models/meta_model","rb"))
 utility = Utility()
 preprocess = Preprocess()
 if st.button("Predict"):
     cast = []
-    if len(wiki_url)!=0:
-            cast_wiki = wikipedia.page(title=wiki_url.split("/")[-1].replace("_"," "), auto_suggest=False).section("Cast")
-            cast_names = [val.split(" as ")[0] for val in cast_wiki.split("\n")]
-            for actor in cast_names[:5]:
                   try:
-                    cast.append(wikipedia.page(title=actor).pageid)
                   except:
-                    search_results = wikipedia.search(actor,results=2)
-                    try:
-                      cast.append(wikipedia.page(title=search_results[0]).pageid)
-                    except:
-                      try:
-                        cast.append(wikipedia.page(title=search_results(actor)[1]).pageid)
-                      except:
-                        pass
-    st.write("Wiki Ids of Top 5 Cast:",cast)
     st.write("Genre: ")
     clean_plot = preprocess.apply(text)
-    # Base Model 1: DistilBERT
-    id2label, label2id, tokenizer, tokenized_plot = utility.tokenize(clean_plot, ["Action","Drama", "Romance", "Comedy", "Thriller"])
-    input_ids = [np.asarray(tokenized_plot['input_ids'])]
-    attention_mask = [np.asarray(tokenized_plot['attention_mask'])]
-    y_pred = model(torch.IntTensor(input_ids), torch.IntTensor(attention_mask))
-    pred = torch.FloatTensor(y_pred['logits'][0])
-    sigmoid = torch.nn.Sigmoid()
-    distilbert_pred = sigmoid(pred.squeeze().cpu())
-    # Base model 2: LR One Vs All
-    cast_features = []
-    for actor in cast:
-        if actor in top_actors:
-            cast_features.append(str(actor))
-    lr_model_pred = lr_model.predict_proba(cast_mlb.transform([cast_features]))
-    # Concatenating Outputs of base models
-    r1 = distilbert_pred[3]
-    r2 = distilbert_pred[1]
-    r3 = distilbert_pred[2]
-    distilbert_pred[1] = r1
-    distilbert_pred[2] = r2
-    distilbert_pred[3] = r3
-    pred1 = distilbert_pred
-    pred2 = lr_model_pred
-    distilbert_pred = pred1.detach().numpy()
-    lr_model_pred = np.array(pred2)[0]
-    concat_features = np.concatenate((lr_model_pred,distilbert_pred))
-    # Meta model 3: LR One Vs All
-    probs = meta_model.predict_proba([concat_features])
-    # Preparing Output
-    out = []
-    id2label = {0:"Action",1:"Comedy",2:"Drama",3:"Romance",4:"Thriller"}
-    i = 0
-    for prob in probs[0]:
-        out.append([id2label[i], prob])
-        i += 1
     st.write(out)

 st.title("Movie Genre Predictor")
 text = st.text_input('Enter plot of the movie')
+st.caption("Either enter Wiki URL or the Cast info of the movie. Cast will be fetched from the Wiki page if cast is not provided")
+wiki_url = st.text_input("Enter Wiki URL of the movie (Needed for fetching the cast information)")
+cast_input = st.text_input("Enter Wiki IDs of the cast (Should be separated by comma)")
 model = AutoModelForSequenceClassification.from_pretrained("./checkpoint-36819")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 utility = Utility()
 preprocess = Preprocess()
+out = []
 if st.button("Predict"):
     # Handler for the "Predict" button. Resolves the cast (Wikipedia page
     # IDs), scores the cleaned plot with DistilBERT and, when at least one
     # cast ID is available, stacks those scores with the cast-based LR
     # model through the meta model. Results are appended to the
     # module-level `out` list as [label, probability] pairs and displayed.

     # Explicitly entered cast IDs take priority. Split on "," and strip
     # each token so "a,b" and "a, b" (or a mix) parse the same way. Blank
     # tokens are dropped: "".split(",") yields [""], which would otherwise
     # count as one cast member and make the plot-only fallback unreachable.
     cast = [token.strip() for token in cast_input.split(",") if token.strip()]

     if not cast and wiki_url:
         page_title = wiki_url.split("/")[-1].replace("_", " ")
         html_page = wikipedia.page(title=page_title, auto_suggest=False).html()
         # Keep only the HTML after the last "Cast" section edit link. If
         # the page has no such link, skip the lookup rather than treating
         # the first links of the whole page as cast members.
         _, marker, cast_wiki = html_page.rpartition(" title=\"Edit section: Cast\">edit</a>")
         if marker:
             anchor_tags = cast_wiki.split("<a href=")[1:6]
             top5_cast_links = [val.split("\"")[1] for val in anchor_tags]
             for actor in top5_cast_links:
                 actor_title = actor.split("/")[-1].replace("_", " ")
                 try:
                     cast.append(wikipedia.page(title=actor_title).pageid)
                 except Exception:
                     # Best effort: a link that does not resolve to a
                     # Wikipedia page is simply left out of the cast.
                     continue

     cast_str = ", ".join(str(actor) for actor in cast)
     st.write("Wiki Ids of Top 5 Cast:", cast_str)
     st.write("Genre: ")
     clean_plot = preprocess.apply(text)

     # Base Model 1: DistilBERT. Both paths below need these scores, so the
     # model is run once here instead of once per branch.
     id2label, _, _, tokenized_plot = utility.tokenize(clean_plot, ["Action","Drama", "Romance", "Comedy", "Thriller"])
     input_ids = [np.asarray(tokenized_plot['input_ids'])]
     attention_mask = [np.asarray(tokenized_plot['attention_mask'])]
     y_pred = model(torch.IntTensor(input_ids), torch.IntTensor(attention_mask))
     pred = torch.FloatTensor(y_pred['logits'][0])
     sigmoid = torch.nn.Sigmoid()
     distilbert_pred = sigmoid(pred.squeeze().cpu())

     # Use Meta Model approach when cast information is available otherwise use DistilBERT model
     if cast:
         # Base model 2: LR One Vs All, fed only with actors present in top_actors.
         cast_features = [str(actor) for actor in cast if actor in top_actors]
         lr_model_pred = np.array(lr_model.predict_proba(cast_mlb.transform([cast_features])))[0]

         # Reorder the DistilBERT scores so that new[1]=old[3], new[2]=old[1],
         # new[3]=old[2], i.e. from the label order passed to tokenize()
         # to the alphabetical order of the id2label mapping below.
         # This is done with one index permutation on a NumPy copy: swapping
         # through temporaries taken with tensor[i] does not work because
         # those temporaries are views and change as the tensor is written.
         # NOTE(review): confirm the meta model was trained on features in
         # this (intended) order.
         distilbert_features = distilbert_pred.detach().numpy()[[0, 3, 1, 2, 4]]

         # Concatenating Outputs of base models
         concat_features = np.concatenate((lr_model_pred, distilbert_features))

         # Meta model 3: LR One Vs All
         probs = meta_model.predict_proba([concat_features])

         # Preparing Output
         id2label = {0:"Action",1:"Comedy",2:"Drama",3:"Romance",4:"Thriller"}
         out.extend([id2label[i], prob] for i, prob in enumerate(probs[0]))
     else:
         # No cast available: report the DistilBERT scores directly, labelled
         # with the id2label mapping returned by utility.tokenize().
         # .item() replaces np.asscalar, which was removed from NumPy.
         out.extend([id2label[i], prob.item()] for i, prob in enumerate(distilbert_pred))
     st.write(out)