Spaces:
Sleeping
Sleeping
Version 2.0 second run
Browse files
app.py
CHANGED
|
@@ -123,8 +123,8 @@ embedder_both = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5",
|
|
| 123 |
retriever_embedding_both = InMemoryEmbeddingRetriever(document_store=document_store)
|
| 124 |
retriever_bm25_both = InMemoryBM25Retriever(document_store=document_store)
|
| 125 |
joiner = DocumentJoiner(
|
| 126 |
-
join_mode="reciprocal_rank_fusion",
|
| 127 |
-
weights=[0.5, 0.5]
|
| 128 |
)
|
| 129 |
# Both (Hybrid) pipeline
|
| 130 |
both_pipeline = Pipeline()
|
|
@@ -136,12 +136,14 @@ both_pipeline.connect("embedder.embedding", "retriever_embedding.query_embedding
|
|
| 136 |
both_pipeline.connect("retriever_embedding", "joiner")
|
| 137 |
both_pipeline.connect("retriever_bm25", "joiner")
|
| 138 |
logger.info(f"Both pipeline components: {both_pipeline.graph.nodes.keys()}")
|
|
|
|
| 139 |
# Components for keyword_pipeline
|
| 140 |
retriever_bm25_key = InMemoryBM25Retriever(document_store=document_store)
|
| 141 |
# Keywords pipeline
|
| 142 |
keyword_pipeline = Pipeline()
|
| 143 |
keyword_pipeline.add_component("retriever_bm25", retriever_bm25_key)
|
| 144 |
logger.info(f"Keyword pipeline components: {keyword_pipeline.graph.nodes.keys()}")
|
|
|
|
| 145 |
# Components for semantic_pipeline
|
| 146 |
embedder_sem = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5", normalize_embeddings=True)
|
| 147 |
retriever_embedding_sem = InMemoryEmbeddingRetriever(document_store=document_store)
|
|
@@ -662,7 +664,8 @@ def save_codex_tree(tree):
|
|
| 662 |
api.upload_file(
|
| 663 |
path_or_fileobj=codex_tree_path,
|
| 664 |
path_in_repo=codex_tree_path,
|
| 665 |
-
repo_id="hetzerdj/preternatural-text-ui"
|
|
|
|
| 666 |
)
|
| 667 |
logger.info("Auto-committed codex_tree.json to HF repo")
|
| 668 |
except Exception as e:
|
|
@@ -715,7 +718,48 @@ def render_static_story(story):
|
|
| 715 |
text = full_md[story['start_char']:story['end_char']]
|
| 716 |
return f"# {story['title']}\n\n**Source**: {book_slug.replace('_', ' ').title()}\n**Pages**: {story['pages']}\n**Keywords**: {story['keywords']}\n\n{text}"
|
| 717 |
|
| 718 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
def gradio_ui():
|
| 720 |
try:
|
| 721 |
logger.info("Building Gradio UI...")
|
|
@@ -731,7 +775,7 @@ def gradio_ui():
|
|
| 731 |
edited_end = gr.State(0)
|
| 732 |
edited_keywords = gr.State("")
|
| 733 |
results_state = gr.State([])
|
| 734 |
-
pending_updates = gr.State({})
|
| 735 |
codex_tree_state = gr.State(load_codex_tree())
|
| 736 |
# For chained dropdowns (shared for assignment and tree nav)
|
| 737 |
cat_path = gr.State([])
|
|
@@ -748,7 +792,6 @@ def gradio_ui():
|
|
| 748 |
search_mode = gr.Dropdown(label="Search Mode", choices=["Keywords (Exact/Phrase Matches)", "Semantic (Conceptual Similarity)", "Both (Hybrid)", "Exact (Word/Phrase)"], value="Both (Hybrid)")
|
| 749 |
min_score_input = gr.Number(label="Min Score Threshold", value=0.1, minimum=0.0, step=0.05)
|
| 750 |
search_btn = gr.Button("Search")
|
| 751 |
-
|
| 752 |
# Main Row: Three Columns
|
| 753 |
with gr.Row(variant="panel"):
|
| 754 |
# Left: Search results
|
|
@@ -761,7 +804,7 @@ def gradio_ui():
|
|
| 761 |
gr.Markdown("### Story Viewer")
|
| 762 |
viewer = gr.HTML(value="Select a story to view...", show_label=False)
|
| 763 |
hidden_page = gr.Textbox(visible=False, show_label=False)
|
| 764 |
-
start_slider = gr.Slider(label="Start Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False)
|
| 765 |
end_slider = gr.Slider(label="End Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False)
|
| 766 |
start_phrase = gr.Textbox(label="Start After Phrase (e.g., introduction)", placeholder="Phrase to set start after...")
|
| 767 |
end_phrase = gr.Textbox(label="End Before Phrase (e.g., conclusion)", placeholder="Phrase to set end before...")
|
|
@@ -775,7 +818,6 @@ def gradio_ui():
|
|
| 775 |
level4 = gr.Dropdown(label="Level 4", choices=[], value=None)
|
| 776 |
level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
|
| 777 |
level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
|
| 778 |
-
category_assignment = gr.Textbox(label="Selected Category", value="", interactive=False)
|
| 779 |
reset_btn = gr.Button("Reset Boundaries")
|
| 780 |
edit_status = gr.Textbox(label="Status", value="", interactive=False)
|
| 781 |
add_btn = gr.Button("Add to List")
|
|
@@ -1035,10 +1077,10 @@ def gradio_ui():
|
|
| 1035 |
|
| 1036 |
def assign_category(l1, l2, l3, l4, l5, l6, tree, sel, start, end, kw):
|
| 1037 |
if not sel:
|
| 1038 |
-
return tree
|
| 1039 |
path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
|
| 1040 |
if not path:
|
| 1041 |
-
return tree
|
| 1042 |
story = {
|
| 1043 |
"title": sel['title'],
|
| 1044 |
"book_slug": sel['book_slug'],
|
|
@@ -1049,8 +1091,8 @@ def gradio_ui():
|
|
| 1049 |
}
|
| 1050 |
tree = assign_to_path(tree, path, story)
|
| 1051 |
save_codex_tree(tree)
|
| 1052 |
-
return tree
|
| 1053 |
-
level6.change(assign_category, [level1, level2, level3, level4, level5, level6, codex_tree_state, selected_story, edited_start, edited_end, edited_keywords],
|
| 1054 |
|
| 1055 |
with gr.TabItem("Codex Tree"):
|
| 1056 |
with gr.Row():
|
|
@@ -1064,21 +1106,17 @@ def gradio_ui():
|
|
| 1064 |
tree_level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
|
| 1065 |
tree_level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
|
| 1066 |
tree_stories_radio = gr.Radio(label="Stories", choices=[], interactive=True)
|
| 1067 |
-
tree_path_display = gr.Textbox(label="Current Path", value="", interactive=False)
|
| 1068 |
# Chained updates same as above
|
| 1069 |
-
tree_level1.change(update_level2, tree_level1, tree_level2).then(lambda: gr.update(value=None), tree_level2, tree_level2)
|
| 1070 |
-
tree_level2.change(update_level3, [tree_level1, tree_level2], tree_level3).then(lambda: gr.update(value=None), tree_level3, tree_level3)
|
| 1071 |
-
tree_level3.change(update_level4, [tree_level1, tree_level2, tree_level3], tree_level4).then(lambda: gr.update(value=None), tree_level4, tree_level4)
|
| 1072 |
-
tree_level4.change(update_level5, [tree_level1, tree_level2, tree_level3, tree_level4], tree_level5).then(lambda: gr.update(value=None), tree_level5, tree_level5)
|
| 1073 |
-
tree_level5.change(update_level6, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5], tree_level6).then(lambda: gr.update(value=None), tree_level6, tree_level6)
|
| 1074 |
-
tree_level6.change(lambda l1, l2, l3, l4, l5, l6: path_to_string([l1, l2, l3, l4, l5, l6]), [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6], tree_path_display)
|
| 1075 |
-
|
| 1076 |
def update_tree_stories(l1, l2, l3, l4, l5, l6, tree):
|
| 1077 |
path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
|
| 1078 |
stories = get_stories_at_path(tree, path)
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
return gr.update(choices=[s['title'] for s in stories], value=None, label="Stories")
|
| 1082 |
tree_level6.change(update_tree_stories, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], tree_stories_radio)
|
| 1083 |
|
| 1084 |
download_json_btn = gr.Button("Download codex_tree.json")
|
|
@@ -1094,21 +1132,21 @@ def gradio_ui():
|
|
| 1094 |
view_in_text_btn = gr.Button("View in Text")
|
| 1095 |
|
| 1096 |
# Tree events
|
| 1097 |
-
def select_tree_story(selected_title,
|
| 1098 |
if not selected_title:
|
| 1099 |
return "No story selected.", "Static"
|
| 1100 |
-
path = [p for p in
|
| 1101 |
stories = get_stories_at_path(tree, path)
|
| 1102 |
story = next((s for s in stories if s['title'] == selected_title), None)
|
| 1103 |
if story:
|
| 1104 |
return render_static_story(story), "Static"
|
| 1105 |
return "Story not found.", "Static"
|
| 1106 |
-
tree_stories_radio.change(select_tree_story, [tree_stories_radio,
|
| 1107 |
|
| 1108 |
-
def toggle_view_mode(
|
| 1109 |
-
path = [p for p in
|
| 1110 |
stories = get_stories_at_path(tree, path)
|
| 1111 |
-
story = next((s for s in stories if s['title'] ==
|
| 1112 |
if not story:
|
| 1113 |
return "Story not found.", "Static"
|
| 1114 |
if mode == "Static":
|
|
@@ -1117,7 +1155,7 @@ def gradio_ui():
|
|
| 1117 |
return html, "Book"
|
| 1118 |
else:
|
| 1119 |
return render_static_story(story), "Static"
|
| 1120 |
-
view_in_text_btn.click(toggle_view_mode, [tree_stories_radio, view_mode,
|
| 1121 |
|
| 1122 |
return demo
|
| 1123 |
except Exception as e:
|
|
|
|
| 123 |
retriever_embedding_both = InMemoryEmbeddingRetriever(document_store=document_store)
|
| 124 |
retriever_bm25_both = InMemoryBM25Retriever(document_store=document_store)
|
| 125 |
joiner = DocumentJoiner(
|
| 126 |
+
join_mode="reciprocal_rank_fusion",
|
| 127 |
+
weights=[0.5, 0.5]
|
| 128 |
)
|
| 129 |
# Both (Hybrid) pipeline
|
| 130 |
both_pipeline = Pipeline()
|
|
|
|
| 136 |
both_pipeline.connect("retriever_embedding", "joiner")
|
| 137 |
both_pipeline.connect("retriever_bm25", "joiner")
|
| 138 |
logger.info(f"Both pipeline components: {both_pipeline.graph.nodes.keys()}")
|
| 139 |
+
|
| 140 |
# Components for keyword_pipeline
|
| 141 |
retriever_bm25_key = InMemoryBM25Retriever(document_store=document_store)
|
| 142 |
# Keywords pipeline
|
| 143 |
keyword_pipeline = Pipeline()
|
| 144 |
keyword_pipeline.add_component("retriever_bm25", retriever_bm25_key)
|
| 145 |
logger.info(f"Keyword pipeline components: {keyword_pipeline.graph.nodes.keys()}")
|
| 146 |
+
|
| 147 |
# Components for semantic_pipeline
|
| 148 |
embedder_sem = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5", normalize_embeddings=True)
|
| 149 |
retriever_embedding_sem = InMemoryEmbeddingRetriever(document_store=document_store)
|
|
|
|
| 664 |
api.upload_file(
|
| 665 |
path_or_fileobj=codex_tree_path,
|
| 666 |
path_in_repo=codex_tree_path,
|
| 667 |
+
repo_id="hetzerdj/preternatural-text-ui",
|
| 668 |
+
repo_type="space" # Fixed for Spaces
|
| 669 |
)
|
| 670 |
logger.info("Auto-committed codex_tree.json to HF repo")
|
| 671 |
except Exception as e:
|
|
|
|
| 718 |
text = full_md[story['start_char']:story['end_char']]
|
| 719 |
return f"# {story['title']}\n\n**Source**: {book_slug.replace('_', ' ').title()}\n**Pages**: {story['pages']}\n**Keywords**: {story['keywords']}\n\n{text}"
|
| 720 |
|
| 721 |
+
# Helper: Reset hidden page
|
| 722 |
+
def reset_hidden_page():
    """Return the placeholder page value used to reset the hidden page field.

    The value "0" is intentionally not a valid page, so writing a real page
    number afterwards always differs from it and registers as a change.
    (Presumably bound to the hidden page textbox; the wiring is not shown
    here.)
    """
    placeholder_page = "0"  # Invalid page to trigger change
    return placeholder_page
|
| 724 |
+
|
| 725 |
+
# Helper: Set hidden page
|
| 726 |
+
def set_hidden_page(selected):
    """Return the first page of the selected story as a string.

    Args:
        selected: Story dict whose 'pages' entry looks like "12-15" (or a
            single page), or a falsy value when nothing is selected.

    Returns:
        The text before the first '-' in ``selected['pages']`` with
        surrounding whitespace removed, or "1" when there is no selection or
        no usable page value.
    """
    if not selected:
        return "1"
    # str() tolerates a numeric page value; a missing or empty entry falls
    # back to the same default as "no selection" instead of raising.
    pages = str(selected.get('pages') or "")
    first_page = pages.split('-')[0].strip()
    return first_page or "1"
|
| 730 |
+
|
| 731 |
+
# Helper: Update pending after changes
|
| 732 |
+
def update_pending_after_changes(pending, selected, new_start, new_end, new_keywords):
    """Record, or clear, the unsaved edits for the selected story.

    The edited values are compared with the story's stored values. Only the
    fields that differ are kept, under ``pending[book_slug][title]``. When
    nothing differs any more, the story's entry is removed, and the book's
    entry too once it is empty.

    Args:
        pending: Mapping of the form
            ``{book_slug: {title: {'start_char': ..., 'end_char': ...,
            'keywords': [...]}}}``; each inner dict holds only changed fields.
            It is modified in place.
        selected: Story dict providing 'book_slug', 'title', 'start_char',
            'end_char' and 'keywords', or a falsy value when nothing is
            selected.
        new_start: Edited start offset.
        new_end: Edited end offset.
        new_keywords: Edited keywords as a comma-separated string (a list of
            strings or None is also tolerated).

    Returns:
        The same ``pending`` object, after the update.
    """
    if not selected:
        return pending

    book_slug = selected['book_slug']
    title = selected['title']
    orig_start = selected['start_char']
    orig_end = selected['end_char']
    orig_keywords = selected['keywords']

    # Collect only the fields that differ from the stored story.
    update_dict = {}
    if new_start != orig_start:
        update_dict['start_char'] = new_start
    if new_end != orig_end:
        update_dict['end_char'] = new_end
    if new_keywords != orig_keywords:
        if isinstance(new_keywords, str):
            raw_keywords = new_keywords.split(',')
        else:
            raw_keywords = new_keywords or []
        stripped = (k.strip() for k in raw_keywords)
        # dict.fromkeys drops duplicates while keeping the order the user
        # typed; a set would reorder the keywords unpredictably.
        update_dict['keywords'] = list(dict.fromkeys(k for k in stripped if k))

    if update_dict:
        pending.setdefault(book_slug, {})[title] = update_dict
    elif title in pending.get(book_slug, {}):
        # Everything matches the stored story again: forget the stale edit.
        del pending[book_slug][title]
        if not pending[book_slug]:
            del pending[book_slug]
    return pending
|
| 761 |
+
|
| 762 |
+
# Gradio UI Definition (updated for multi-book: book dropdown, per-book loads)
|
| 763 |
def gradio_ui():
|
| 764 |
try:
|
| 765 |
logger.info("Building Gradio UI...")
|
|
|
|
| 775 |
edited_end = gr.State(0)
|
| 776 |
edited_keywords = gr.State("")
|
| 777 |
results_state = gr.State([])
|
| 778 |
+
pending_updates = gr.State({}) # {book_slug: {title: {'start_char': int, 'end_char': int}}}
|
| 779 |
codex_tree_state = gr.State(load_codex_tree())
|
| 780 |
# For chained dropdowns (shared for assignment and tree nav)
|
| 781 |
cat_path = gr.State([])
|
|
|
|
| 792 |
search_mode = gr.Dropdown(label="Search Mode", choices=["Keywords (Exact/Phrase Matches)", "Semantic (Conceptual Similarity)", "Both (Hybrid)", "Exact (Word/Phrase)"], value="Both (Hybrid)")
|
| 793 |
min_score_input = gr.Number(label="Min Score Threshold", value=0.1, minimum=0.0, step=0.05)
|
| 794 |
search_btn = gr.Button("Search")
|
|
|
|
| 795 |
# Main Row: Three Columns
|
| 796 |
with gr.Row(variant="panel"):
|
| 797 |
# Left: Search results
|
|
|
|
| 804 |
gr.Markdown("### Story Viewer")
|
| 805 |
viewer = gr.HTML(value="Select a story to view...", show_label=False)
|
| 806 |
hidden_page = gr.Textbox(visible=False, show_label=False)
|
| 807 |
+
start_slider = gr.Slider(label="Start Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False) # Larger max for multi-book
|
| 808 |
end_slider = gr.Slider(label="End Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False)
|
| 809 |
start_phrase = gr.Textbox(label="Start After Phrase (e.g., introduction)", placeholder="Phrase to set start after...")
|
| 810 |
end_phrase = gr.Textbox(label="End Before Phrase (e.g., conclusion)", placeholder="Phrase to set end before...")
|
|
|
|
| 818 |
level4 = gr.Dropdown(label="Level 4", choices=[], value=None)
|
| 819 |
level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
|
| 820 |
level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
|
|
|
|
| 821 |
reset_btn = gr.Button("Reset Boundaries")
|
| 822 |
edit_status = gr.Textbox(label="Status", value="", interactive=False)
|
| 823 |
add_btn = gr.Button("Add to List")
|
|
|
|
| 1077 |
|
| 1078 |
def assign_category(l1, l2, l3, l4, l5, l6, tree, sel, start, end, kw):
|
| 1079 |
if not sel:
|
| 1080 |
+
return tree
|
| 1081 |
path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
|
| 1082 |
if not path:
|
| 1083 |
+
return tree
|
| 1084 |
story = {
|
| 1085 |
"title": sel['title'],
|
| 1086 |
"book_slug": sel['book_slug'],
|
|
|
|
| 1091 |
}
|
| 1092 |
tree = assign_to_path(tree, path, story)
|
| 1093 |
save_codex_tree(tree)
|
| 1094 |
+
return tree
|
| 1095 |
+
level6.change(assign_category, [level1, level2, level3, level4, level5, level6, codex_tree_state, selected_story, edited_start, edited_end, edited_keywords], codex_tree_state)
|
| 1096 |
|
| 1097 |
with gr.TabItem("Codex Tree"):
|
| 1098 |
with gr.Row():
|
|
|
|
| 1106 |
tree_level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
|
| 1107 |
tree_level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
|
| 1108 |
tree_stories_radio = gr.Radio(label="Stories", choices=[], interactive=True)
|
|
|
|
| 1109 |
# Chained updates same as above
|
| 1110 |
+
tree_level1.change(update_level2, tree_level1, tree_level2).then(lambda: gr.update(value=None), tree_level2, tree_level2)
|
| 1111 |
+
tree_level2.change(update_level3, [tree_level1, tree_level2], tree_level3).then(lambda: gr.update(value=None), tree_level3, tree_level3)
|
| 1112 |
+
tree_level3.change(update_level4, [tree_level1, tree_level2, tree_level3], tree_level4).then(lambda: gr.update(value=None), tree_level4, tree_level4)
|
| 1113 |
+
tree_level4.change(update_level5, [tree_level1, tree_level2, tree_level3, tree_level4], tree_level5).then(lambda: gr.update(value=None), tree_level5, tree_level5)
|
| 1114 |
+
tree_level5.change(update_level6, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5], tree_level6).then(lambda: gr.update(value=None), tree_level6, tree_level6)
|
|
|
|
|
|
|
| 1115 |
def update_tree_stories(l1, l2, l3, l4, l5, l6, tree):
|
| 1116 |
path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
|
| 1117 |
stories = get_stories_at_path(tree, path)
|
| 1118 |
+
choices = [s['title'] for s in stories]
|
| 1119 |
+
return gr.update(choices=choices, value=None)
|
|
|
|
| 1120 |
tree_level6.change(update_tree_stories, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], tree_stories_radio)
|
| 1121 |
|
| 1122 |
download_json_btn = gr.Button("Download codex_tree.json")
|
|
|
|
| 1132 |
view_in_text_btn = gr.Button("View in Text")
|
| 1133 |
|
| 1134 |
# Tree events
|
| 1135 |
+
def select_tree_story(selected_title, l1, l2, l3, l4, l5, l6, tree):
|
| 1136 |
if not selected_title:
|
| 1137 |
return "No story selected.", "Static"
|
| 1138 |
+
path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
|
| 1139 |
stories = get_stories_at_path(tree, path)
|
| 1140 |
story = next((s for s in stories if s['title'] == selected_title), None)
|
| 1141 |
if story:
|
| 1142 |
return render_static_story(story), "Static"
|
| 1143 |
return "Story not found.", "Static"
|
| 1144 |
+
tree_stories_radio.change(select_tree_story, [tree_stories_radio, tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], [tree_viewer, view_mode])
|
| 1145 |
|
| 1146 |
+
def toggle_view_mode(selected_title, mode, l1, l2, l3, l4, l5, l6, tree):
|
| 1147 |
+
path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
|
| 1148 |
stories = get_stories_at_path(tree, path)
|
| 1149 |
+
story = next((s for s in stories if s['title'] == selected_title), None)
|
| 1150 |
if not story:
|
| 1151 |
return "Story not found.", "Static"
|
| 1152 |
if mode == "Static":
|
|
|
|
| 1155 |
return html, "Book"
|
| 1156 |
else:
|
| 1157 |
return render_static_story(story), "Static"
|
| 1158 |
+
view_in_text_btn.click(toggle_view_mode, [tree_stories_radio, view_mode, tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], [tree_viewer, view_mode])
|
| 1159 |
|
| 1160 |
return demo
|
| 1161 |
except Exception as e:
|