Spaces:

FoodDesert
/

Prompt_Squirrel

Running

App Files Files Community

FoodDesert committed on Apr 1, 2024

Commit

e7aeeed

verified ·

1 Parent(s): cb15d1f

Upload app.py

Browse files

Files changed (1) hide show

app.py +60 -22

app.py CHANGED Viewed

@@ -130,16 +130,17 @@ parser = Lark(grammar, start='start')
 # Function to extract tags
 def extract_tags(tree):
-    tags = []
     def _traverse(node):
         if isinstance(node, Token) and node.type == '__ANON_1':
-            tags.append(node.value.strip())
         elif not isinstance(node, Token):
             for child in node.children:
                 _traverse(child)
     _traverse(tree)
-    return tags
 special_tags = ["score:0", "score:1", "score:2", "score:3", "score:4", "score:5", "score:6", "score:7", "score:8", "score:9"]
@@ -341,7 +342,7 @@ def geometric_mean_given_words(target_word, context_words, co_occurrence_matrix,
 def create_html_tables_for_tags(tag, result, tag2count, tag2idwiki):
     # Wrap the tag part in a <span> with styles for bold and larger font
-    html_str = f"<div style='display: inline-block; margin: 10px; vertical-align: top;'><table><thead><tr><th colspan='3' style='text-align: center; padding-bottom: 10px;'><span style='font-weight: bold; font-size: 20px;'>{tag}</span></th></tr></thead><tbody><tr style='border-bottom: 1px solid #000;'><th>Corrected Tag</th><th>Similarity</th><th>Count</th></tr>"
     # Loop through the results and add table rows for each
     for word, sim in result:
         word_with_underscores = word.replace(' ', '_')
@@ -404,24 +405,35 @@ def find_similar_tags(test_tags, similarity_weight, allow_nsfw_tags):
     if not hasattr(find_similar_tags, "tag2idwiki"):
         find_similar_tags.tag2idwiki = build_tag_id_wiki_dict()
-    transformed_tags = [tag.replace(' ', '_') for tag in test_tags]
     # Find similar tags and prepare data for tables
     html_content = "<div style='display: inline-block; margin: 20px; text-align: center;'>"
     html_content += "<h1>Unknown Tags</h1>"  # Heading for the table
     tags_added = False
-    for tag in test_tags:
-        if tag in special_tags:
             continue
-        modified_tag_for_search = tag.replace(' ','_')
         similar_words = find_similar_tags.fasttext_small_model.most_similar(modified_tag_for_search, topn = 100)
         result, seen = [], set(transformed_tags)
         if modified_tag_for_search in find_similar_tags.tag2aliases:
-            if tag in find_similar_tags.tag2aliases and "_" in tag:   #Implicitly tell the user that they should get rid of the underscore
                 result.append(modified_tag_for_search.replace('_',' '), 1)
-                seen.add(tag)
             else:   #The user correctly did not put underscores in their tag
                 continue
         else:
@@ -444,36 +456,60 @@ def find_similar_tags(test_tags, similarity_weight, allow_nsfw_tags):
         #Adjust score based on context
         for i in range(len(result)):
             word, score = result[i]  # Unpack the tuple
-            geometric_mean = geometric_mean_given_words(word.replace(' ','_'), [context_tag for context_tag in transformed_tags if context_tag != word and context_tag != tag], conditional_co_occurrence_matrix, conditional_vocabulary, conditional_doc_count, smoothing_value=conditional_smoothing)
             adjusted_score = (similarity_weight * geometric_mean) + ((1-similarity_weight)*score)  # Apply the adjustment function
             result[i] = (word, adjusted_score)  # Update the tuple with the adjusted score
             #print(word, score, geometric_mean, adjusted_score)
         result = sorted(result, key=lambda x: x[1], reverse=True)[:10]
-        html_content += create_html_tables_for_tags(tag, result, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
         tags_added=True
     # If no tags were processed, add a message
     if not tags_added:
         html_content = create_html_placeholder(title="Unknown Tags")
-    return html_content  # Return list of lists for Dataframe
-def find_similar_artists(new_tags_string, top_n, similarity_weight, allow_nsfw_tags):
     try:
-        new_tags_string = new_tags_string.lower()
         new_tags_string, removed_tags = remove_special_tags(new_tags_string)
         # Parse the prompt
         parsed = parser.parse(new_tags_string)
         # Extract tags from the parsed tree
         new_image_tags = extract_tags(parsed)
-        new_image_tags = [tag.replace('_', ' ').replace('\\(', '(').replace('\\)', ')').strip() for tag in new_image_tags]
         ###unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys()))   #We may want this line again later.  These are the tags that were not used to calculate the artists list.
-        unseen_tags_data = find_similar_tags(new_image_tags, similarity_weight, allow_nsfw_tags)
-        X_new_image = vectorizer.transform([','.join(new_image_tags + removed_tags)])
         similarities = cosine_similarity(X_new_image, X_artist)[0]
         top_artist_indices = np.argsort(similarities)[-(top_n + 1):][::-1]
@@ -490,7 +526,7 @@ def find_similar_artists(new_tags_string, top_n, similarity_weight, allow_nsfw_t
                 image_galleries.append(baseline)  # Add baseline as its own gallery item
                 image_galleries.append(artists)  # Extend the list with artist tuples
-        return (unseen_tags_data, top_artists_str, dynamic_prompts_formatted_artists, *image_galleries) #image_galleries[0], image_galleries[1] DOES work.  Find a generic alternative.
     except ParseError as e:
         return [], "Parse Error: Check for mismatched parentheses or something", "", None, None
@@ -504,6 +540,8 @@ with gr.Blocks() as app:
             similarity_weight = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="Similarity weight")
             num_artists = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of artists")
             allow_nsfw = gr.Checkbox(label="Allow NSFW Tags", value=False)
     with gr.Row():
         with gr.Column(scale=1):
             top_artists = gr.HTML(label="Top Artists", value=create_html_placeholder(title="Top Artists"))
@@ -521,7 +559,7 @@ with gr.Blocks() as app:
     submit_button.click(
         find_similar_artists,
         inputs=[image_tags, num_artists, similarity_weight, allow_nsfw],
-        outputs=[unseen_tags, top_artists, dynamic_prompts] + galleries
     )
     gr.Markdown(faq_content)

 # Function to extract tags
 def extract_tags(tree):
     """Collect every tag token in a parsed prompt tree with its text offset.

     Walks the tree depth-first. Each ``Token`` whose type is ``'__ANON_1'``
     contributes one ``(tag_text, position)`` tuple, where ``tag_text`` is the
     token value with surrounding whitespace stripped.

     ``position`` is the token's ``start_pos`` moved forward by the number of
     leading whitespace characters that were stripped, so that
     ``position + len(tag_text)`` ends exactly at the end of the tag instead of
     cutting it short. When the token carries no position (``start_pos`` is
     ``None``) the ``None`` is passed through unchanged.

     Args:
         tree: Root node of the parsed prompt; non-token nodes must expose a
             ``children`` iterable.

     Returns:
         A list of ``(tag_text, position)`` tuples in traversal order.
     """
     tags_with_positions = []

     def _traverse(node):
         if isinstance(node, Token) and node.type == '__ANON_1':
             raw_text = node.value
             tag_text = raw_text.strip()
             tag_position = node.start_pos
             if tag_position is not None:
                 # Skip the whitespace removed from the front of the token so
                 # the offset points at the first character of tag_text.
                 tag_position += len(raw_text) - len(raw_text.lstrip())
             tags_with_positions.append((tag_text, tag_position))
         elif not isinstance(node, Token):
             # Tokens of other types are ignored; only tree nodes are descended.
             for child in node.children:
                 _traverse(child)

     _traverse(tree)
     return tags_with_positions
 # Score pseudo-tags ("score:0" through "score:9") that get special handling.
 special_tags = [f"score:{digit}" for digit in range(10)]
 def create_html_tables_for_tags(tag, result, tag2count, tag2idwiki):
     # Wrap the tag part in a <span> with styles for bold and larger font
+    html_str = f"<div style='display: inline-block; margin: 20px; vertical-align: top;'><table><thead><tr><th colspan='3' style='text-align: center; padding-bottom: 10px;'><span style='font-weight: bold; font-size: 20px;'>{tag}</span></th></tr></thead><tbody><tr style='border-bottom: 1px solid #000;'><th>Corrected Tag</th><th>Similarity</th><th>Count</th></tr>"
     # Loop through the results and add table rows for each
     for word, sim in result:
         word_with_underscores = word.replace(' ', '_')
     if not hasattr(find_similar_tags, "tag2idwiki"):
         find_similar_tags.tag2idwiki = build_tag_id_wiki_dict()
+    modified_tags = [tag_info['modified_tag'] for tag_info in test_tags]
+    transformed_tags = [tag.replace(' ', '_') for tag in modified_tags]
     # Find similar tags and prepare data for tables
     html_content = "<div style='display: inline-block; margin: 20px; text-align: center;'>"
     html_content += "<h1>Unknown Tags</h1>"  # Heading for the table
     tags_added = False
+    bad_entities = []
+    for tag_info in test_tags:
+        original_tag = tag_info['original_tag']
+        modified_tag = tag_info['modified_tag']
+        start_pos = tag_info['start_pos']
+        end_pos = tag_info['end_pos']
+        print(original_tag, modified_tag, start_pos, end_pos)
+        if modified_tag in special_tags:
             continue
+        modified_tag_for_search = modified_tag.replace(' ','_')
         similar_words = find_similar_tags.fasttext_small_model.most_similar(modified_tag_for_search, topn = 100)
         result, seen = [], set(transformed_tags)
         if modified_tag_for_search in find_similar_tags.tag2aliases:
+            if modified_tag in find_similar_tags.tag2aliases and "_" in modified_tag:   #Implicitly tell the user that they should get rid of the underscore
                 result.append(modified_tag_for_search.replace('_',' '), 1)
+                seen.add(modified_tag)
             else:   #The user correctly did not put underscores in their tag
                 continue
         else:
         #Adjust score based on context
         for i in range(len(result)):
             word, score = result[i]  # Unpack the tuple
+            geometric_mean = geometric_mean_given_words(word.replace(' ','_'), [context_tag for context_tag in transformed_tags if context_tag != word and context_tag != modified_tag], conditional_co_occurrence_matrix, conditional_vocabulary, conditional_doc_count, smoothing_value=conditional_smoothing)
             adjusted_score = (similarity_weight * geometric_mean) + ((1-similarity_weight)*score)  # Apply the adjustment function
             result[i] = (word, adjusted_score)  # Update the tuple with the adjusted score
             #print(word, score, geometric_mean, adjusted_score)
         result = sorted(result, key=lambda x: x[1], reverse=True)[:10]
+        html_content += create_html_tables_for_tags(modified_tag, result, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
+        bad_entities.append({"entity":"UNKNOWN", "start":start_pos, "end":end_pos})
         tags_added=True
     # If no tags were processed, add a message
     if not tags_added:
         html_content = create_html_placeholder(title="Unknown Tags")
+    return html_content, bad_entities  # Return list of lists for Dataframe
def build_tag_offsets_dicts(new_image_tags_with_positions):
    """Turn ``(tag_text, start_pos)`` pairs into per-tag offset records.

    Args:
        new_image_tags_with_positions: Iterable of ``(tag_text, start_pos)``
            pairs, where ``start_pos`` is the tag's offset in the prompt text.

    Returns:
        A list with one dict per input pair, in input order, holding:
          * ``"original_tag"``: the tag text as given,
          * ``"start_pos"``: the given start offset,
          * ``"end_pos"``: ``start_pos + len(tag_text)``,
          * ``"modified_tag"``: the tag with underscores turned into spaces,
            ``\\(`` / ``\\)`` unescaped to plain parentheses, and surrounding
            whitespace stripped.
    """
    tag_data = []
    for tag_text, start_pos in new_image_tags_with_positions:
        modified_tag = (
            tag_text.replace('_', ' ')
            .replace('\\(', '(')
            .replace('\\)', ')')
            .strip()
        )
        tag_data.append({
            "original_tag": tag_text,
            "start_pos": start_pos,
            # The end offset is measured on the unmodified text: unescaping
            # can shorten modified_tag, but the span must cover the original.
            "end_pos": start_pos + len(tag_text),
            "modified_tag": modified_tag,
        })
    return tag_data
+def find_similar_artists(original_tags_string, top_n, similarity_weight, allow_nsfw_tags):
     try:
+        new_tags_string = original_tags_string.lower()
         new_tags_string, removed_tags = remove_special_tags(new_tags_string)
         # Parse the prompt
         parsed = parser.parse(new_tags_string)
         # Extract tags from the parsed tree
         new_image_tags = extract_tags(parsed)
+        tag_data = build_tag_offsets_dicts(new_image_tags)
         ###unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys()))   #We may want this line again later.  These are the tags that were not used to calculate the artists list.
+        unseen_tags_data, bad_entities = find_similar_tags(tag_data, similarity_weight, allow_nsfw_tags)
+        bad_tags_illustrated_string = {"text":new_tags_string, "entities":bad_entities}
+        modified_tags = [tag_info['modified_tag'] for tag_info in tag_data]
+        X_new_image = vectorizer.transform([','.join(modified_tags + removed_tags)])
         similarities = cosine_similarity(X_new_image, X_artist)[0]
         top_artist_indices = np.argsort(similarities)[-(top_n + 1):][::-1]
                 image_galleries.append(baseline)  # Add baseline as its own gallery item
                 image_galleries.append(artists)  # Extend the list with artist tuples
+        return (unseen_tags_data, bad_tags_illustrated_string, top_artists_str, dynamic_prompts_formatted_artists, *image_galleries) #image_galleries[0], image_galleries[1] DOES work.  Find a generic alternative.
     except ParseError as e:
         return [], "Parse Error: Check for mismatched parentheses or something", "", None, None
             similarity_weight = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="Similarity weight")
             num_artists = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of artists")
             allow_nsfw = gr.Checkbox(label="Allow NSFW Tags", value=False)
+    with gr.Row():
+        bad_tags_illustrated_string = gr.HighlightedText()
     with gr.Row():
         with gr.Column(scale=1):
             top_artists = gr.HTML(label="Top Artists", value=create_html_placeholder(title="Top Artists"))
     submit_button.click(
         find_similar_artists,
         inputs=[image_tags, num_artists, similarity_weight, allow_nsfw],
+        outputs=[unseen_tags, bad_tags_illustrated_string, top_artists, dynamic_prompts] + galleries
     )
     gr.Markdown(faq_content)