hetzerdj committed on
Commit
d7846fc
·
1 Parent(s): 2c5d170

Version 2.0 second run

Browse files
Files changed (1) hide show
  1. app.py +68 -30
app.py CHANGED
@@ -123,8 +123,8 @@ embedder_both = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5",
123
  retriever_embedding_both = InMemoryEmbeddingRetriever(document_store=document_store)
124
  retriever_bm25_both = InMemoryBM25Retriever(document_store=document_store)
125
  joiner = DocumentJoiner(
126
- join_mode="reciprocal_rank_fusion", # UPDATED: Use RRF for better hybrid fusion
127
- weights=[0.5, 0.5] # UPDATED: Equal weights for embedding and BM25; adjust as needed (e.g., [0.6, 0.4] for more semantic bias)
128
  )
129
  # Both (Hybrid) pipeline
130
  both_pipeline = Pipeline()
@@ -136,12 +136,14 @@ both_pipeline.connect("embedder.embedding", "retriever_embedding.query_embedding
136
  both_pipeline.connect("retriever_embedding", "joiner")
137
  both_pipeline.connect("retriever_bm25", "joiner")
138
  logger.info(f"Both pipeline components: {both_pipeline.graph.nodes.keys()}")
 
139
  # Components for keyword_pipeline
140
  retriever_bm25_key = InMemoryBM25Retriever(document_store=document_store)
141
  # Keywords pipeline
142
  keyword_pipeline = Pipeline()
143
  keyword_pipeline.add_component("retriever_bm25", retriever_bm25_key)
144
  logger.info(f"Keyword pipeline components: {keyword_pipeline.graph.nodes.keys()}")
 
145
  # Components for semantic_pipeline
146
  embedder_sem = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5", normalize_embeddings=True)
147
  retriever_embedding_sem = InMemoryEmbeddingRetriever(document_store=document_store)
@@ -662,7 +664,8 @@ def save_codex_tree(tree):
662
  api.upload_file(
663
  path_or_fileobj=codex_tree_path,
664
  path_in_repo=codex_tree_path,
665
- repo_id="hetzerdj/preternatural-text-ui" # Your SPACE_ID
 
666
  )
667
  logger.info("Auto-committed codex_tree.json to HF repo")
668
  except Exception as e:
@@ -715,7 +718,48 @@ def render_static_story(story):
715
  text = full_md[story['start_char']:story['end_char']]
716
  return f"# {story['title']}\n\n**Source**: {book_slug.replace('_', ' ').title()}\n**Pages**: {story['pages']}\n**Keywords**: {story['keywords']}\n\n{text}"
717
 
718
- # Gradio UI Definition
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719
  def gradio_ui():
720
  try:
721
  logger.info("Building Gradio UI...")
@@ -731,7 +775,7 @@ def gradio_ui():
731
  edited_end = gr.State(0)
732
  edited_keywords = gr.State("")
733
  results_state = gr.State([])
734
- pending_updates = gr.State({})
735
  codex_tree_state = gr.State(load_codex_tree())
736
  # For chained dropdowns (shared for assignment and tree nav)
737
  cat_path = gr.State([])
@@ -748,7 +792,6 @@ def gradio_ui():
748
  search_mode = gr.Dropdown(label="Search Mode", choices=["Keywords (Exact/Phrase Matches)", "Semantic (Conceptual Similarity)", "Both (Hybrid)", "Exact (Word/Phrase)"], value="Both (Hybrid)")
749
  min_score_input = gr.Number(label="Min Score Threshold", value=0.1, minimum=0.0, step=0.05)
750
  search_btn = gr.Button("Search")
751
-
752
  # Main Row: Three Columns
753
  with gr.Row(variant="panel"):
754
  # Left: Search results
@@ -761,7 +804,7 @@ def gradio_ui():
761
  gr.Markdown("### Story Viewer")
762
  viewer = gr.HTML(value="Select a story to view...", show_label=False)
763
  hidden_page = gr.Textbox(visible=False, show_label=False)
764
- start_slider = gr.Slider(label="Start Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False)
765
  end_slider = gr.Slider(label="End Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False)
766
  start_phrase = gr.Textbox(label="Start After Phrase (e.g., introduction)", placeholder="Phrase to set start after...")
767
  end_phrase = gr.Textbox(label="End Before Phrase (e.g., conclusion)", placeholder="Phrase to set end before...")
@@ -775,7 +818,6 @@ def gradio_ui():
775
  level4 = gr.Dropdown(label="Level 4", choices=[], value=None)
776
  level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
777
  level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
778
- category_assignment = gr.Textbox(label="Selected Category", value="", interactive=False)
779
  reset_btn = gr.Button("Reset Boundaries")
780
  edit_status = gr.Textbox(label="Status", value="", interactive=False)
781
  add_btn = gr.Button("Add to List")
@@ -1035,10 +1077,10 @@ def gradio_ui():
1035
 
1036
  def assign_category(l1, l2, l3, l4, l5, l6, tree, sel, start, end, kw):
1037
  if not sel:
1038
- return tree, ""
1039
  path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
1040
  if not path:
1041
- return tree, ""
1042
  story = {
1043
  "title": sel['title'],
1044
  "book_slug": sel['book_slug'],
@@ -1049,8 +1091,8 @@ def gradio_ui():
1049
  }
1050
  tree = assign_to_path(tree, path, story)
1051
  save_codex_tree(tree)
1052
- return tree, path_to_string(path)
1053
- level6.change(assign_category, [level1, level2, level3, level4, level5, level6, codex_tree_state, selected_story, edited_start, edited_end, edited_keywords], [codex_tree_state, category_assignment])
1054
 
1055
  with gr.TabItem("Codex Tree"):
1056
  with gr.Row():
@@ -1064,21 +1106,17 @@ def gradio_ui():
1064
  tree_level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
1065
  tree_level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
1066
  tree_stories_radio = gr.Radio(label="Stories", choices=[], interactive=True)
1067
- tree_path_display = gr.Textbox(label="Current Path", value="", interactive=False)
1068
  # Chained updates same as above
1069
- tree_level1.change(update_level2, tree_level1, tree_level2).then(lambda: gr.update(value=None), tree_level2, tree_level2).then(lambda l1: path_to_string([l1]), tree_level1, tree_path_display)
1070
- tree_level2.change(update_level3, [tree_level1, tree_level2], tree_level3).then(lambda: gr.update(value=None), tree_level3, tree_level3).then(lambda l1, l2: path_to_string([l1, l2]), [tree_level1, tree_level2], tree_path_display)
1071
- tree_level3.change(update_level4, [tree_level1, tree_level2, tree_level3], tree_level4).then(lambda: gr.update(value=None), tree_level4, tree_level4).then(lambda l1, l2, l3: path_to_string([l1, l2, l3]), [tree_level1, tree_level2, tree_level3], tree_path_display)
1072
- tree_level4.change(update_level5, [tree_level1, tree_level2, tree_level3, tree_level4], tree_level5).then(lambda: gr.update(value=None), tree_level5, tree_level5).then(lambda l1, l2, l3, l4: path_to_string([l1, l2, l3, l4]), [tree_level1, tree_level2, tree_level3, tree_level4], tree_path_display)
1073
- tree_level5.change(update_level6, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5], tree_level6).then(lambda: gr.update(value=None), tree_level6, tree_level6).then(lambda l1, l2, l3, l4, l5: path_to_string([l1, l2, l3, l4, l5]), [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5], tree_path_display)
1074
- tree_level6.change(lambda l1, l2, l3, l4, l5, l6: path_to_string([l1, l2, l3, l4, l5, l6]), [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6], tree_path_display)
1075
-
1076
  def update_tree_stories(l1, l2, l3, l4, l5, l6, tree):
1077
  path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
1078
  stories = get_stories_at_path(tree, path)
1079
- if not path or not stories:
1080
- return gr.update(choices=[], value=None, label="Stories (No Stories Yet)")
1081
- return gr.update(choices=[s['title'] for s in stories], value=None, label="Stories")
1082
  tree_level6.change(update_tree_stories, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], tree_stories_radio)
1083
 
1084
  download_json_btn = gr.Button("Download codex_tree.json")
@@ -1094,21 +1132,21 @@ def gradio_ui():
1094
  view_in_text_btn = gr.Button("View in Text")
1095
 
1096
  # Tree events
1097
- def select_tree_story(selected_title, tree_path, tree):
1098
  if not selected_title:
1099
  return "No story selected.", "Static"
1100
- path = [p for p in tree_path if p]
1101
  stories = get_stories_at_path(tree, path)
1102
  story = next((s for s in stories if s['title'] == selected_title), None)
1103
  if story:
1104
  return render_static_story(story), "Static"
1105
  return "Story not found.", "Static"
1106
- tree_stories_radio.change(select_tree_story, [tree_stories_radio, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6], codex_tree_state], [tree_viewer, view_mode])
1107
 
1108
- def toggle_view_mode(story_title, mode, tree_path, tree, hidden_page):
1109
- path = [p for p in tree_path if p]
1110
  stories = get_stories_at_path(tree, path)
1111
- story = next((s for s in stories if s['title'] == story_title), None)
1112
  if not story:
1113
  return "Story not found.", "Static"
1114
  if mode == "Static":
@@ -1117,7 +1155,7 @@ def gradio_ui():
1117
  return html, "Book"
1118
  else:
1119
  return render_static_story(story), "Static"
1120
- view_in_text_btn.click(toggle_view_mode, [tree_stories_radio, view_mode, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6], codex_tree_state, hidden_page], [tree_viewer, view_mode])
1121
 
1122
  return demo
1123
  except Exception as e:
 
123
  retriever_embedding_both = InMemoryEmbeddingRetriever(document_store=document_store)
124
  retriever_bm25_both = InMemoryBM25Retriever(document_store=document_store)
125
  joiner = DocumentJoiner(
126
+ join_mode="reciprocal_rank_fusion",
127
+ weights=[0.5, 0.5]
128
  )
129
  # Both (Hybrid) pipeline
130
  both_pipeline = Pipeline()
 
136
  both_pipeline.connect("retriever_embedding", "joiner")
137
  both_pipeline.connect("retriever_bm25", "joiner")
138
  logger.info(f"Both pipeline components: {both_pipeline.graph.nodes.keys()}")
139
+
140
  # Components for keyword_pipeline
141
  retriever_bm25_key = InMemoryBM25Retriever(document_store=document_store)
142
  # Keywords pipeline
143
  keyword_pipeline = Pipeline()
144
  keyword_pipeline.add_component("retriever_bm25", retriever_bm25_key)
145
  logger.info(f"Keyword pipeline components: {keyword_pipeline.graph.nodes.keys()}")
146
+
147
  # Components for semantic_pipeline
148
  embedder_sem = SentenceTransformersTextEmbedder(model="BAAI/bge-large-en-v1.5", normalize_embeddings=True)
149
  retriever_embedding_sem = InMemoryEmbeddingRetriever(document_store=document_store)
 
664
  api.upload_file(
665
  path_or_fileobj=codex_tree_path,
666
  path_in_repo=codex_tree_path,
667
+ repo_id="hetzerdj/preternatural-text-ui",
668
+ repo_type="space" # Fixed for Spaces
669
  )
670
  logger.info("Auto-committed codex_tree.json to HF repo")
671
  except Exception as e:
 
718
  text = full_md[story['start_char']:story['end_char']]
719
  return f"# {story['title']}\n\n**Source**: {book_slug.replace('_', ' ').title()}\n**Pages**: {story['pages']}\n**Keywords**: {story['keywords']}\n\n{text}"
720
 
721
+ # Helper: Reset hidden page
722
def reset_hidden_page():
    """Return the placeholder value written to the hidden page field.

    The value is the string "0". Per the original inline note it is meant
    to be an invalid page number, so that a later write of a real page is
    seen as a change.
    """
    placeholder_page = "0"  # Invalid page to trigger change
    return placeholder_page
724
+
725
+ # Helper: Set hidden page
726
def set_hidden_page(selected):
    """Return the first page of the selected story's page range as a string.

    Args:
        selected: Story mapping holding a 'pages' entry such as "12-15"
            (a single page like "7" also works), or a falsy value when
            nothing is selected.

    Returns:
        The text before the first '-' in 'pages', stripped of surrounding
        whitespace. Falls back to "1" when nothing is selected or when no
        usable page can be extracted (missing, empty or blank 'pages').
    """
    if not selected:
        return "1"
    # Coerce to str so an integer page number does not crash on .split().
    pages_text = str(selected.get('pages') or "").strip()
    # Strip again so a range written as "12 - 15" yields "12", not "12 ".
    first_page = pages_text.split('-')[0].strip()
    return first_page or "1"
730
+
731
+ # Helper: Update pending after changes
732
def _normalize_keywords(raw_keywords):
    """Turn a comma-separated string (or an iterable) into a clean keyword list.

    Entries are stripped, blanks are dropped, and duplicates are removed
    while keeping first-seen order. None yields an empty list.
    """
    if raw_keywords is None:
        return []
    if isinstance(raw_keywords, str):
        candidates = raw_keywords.split(',')
    else:
        candidates = [str(item) for item in raw_keywords]
    # dict.fromkeys de-duplicates while preserving insertion order, so the
    # stored list is deterministic (list(set(...)) is not).
    return list(dict.fromkeys(k.strip() for k in candidates if k.strip()))


def update_pending_after_changes(pending, selected, new_start, new_end, new_keywords):
    """Record or clear the pending edits for the selected story.

    Args:
        pending: Mapping of the form {book_slug: {title: {field: value}}}.
            It is updated in place and also returned.
        selected: Story mapping providing 'book_slug', 'title',
            'start_char', 'end_char' and 'keywords'; falsy when nothing
            is selected.
        new_start: Edited start offset.
        new_end: Edited end offset.
        new_keywords: Edited keywords, normally a comma-separated string.

    Returns:
        The same `pending` mapping. If any field differs from the values in
        `selected`, pending[book_slug][title] is replaced by a dict holding
        only the changed fields ('keywords' is stored as a list). If nothing
        differs, an existing entry for the story is removed, together with
        the book entry when it becomes empty.
    """
    if not selected:
        return pending

    book_slug = selected['book_slug']
    title = selected['title']

    update_dict = {}
    if new_start != selected['start_char']:
        update_dict['start_char'] = new_start
    if new_end != selected['end_char']:
        update_dict['end_char'] = new_end

    # Compare normalised forms: the stored value may be a list (this function
    # itself stores lists) while the edited value is a string, so a raw
    # comparison could report unchanged keywords as a change.
    kw_list = _normalize_keywords(new_keywords)
    if kw_list != _normalize_keywords(selected['keywords']):
        update_dict['keywords'] = kw_list

    if update_dict:
        pending.setdefault(book_slug, {})[title] = update_dict
    elif book_slug in pending and title in pending[book_slug]:
        # Everything is back to the original values: drop the stale entry.
        del pending[book_slug][title]
        if not pending[book_slug]:
            del pending[book_slug]
    return pending
761
+
762
+ # Gradio UI Definition (updated for multi-book: book dropdown, per-book loads)
763
  def gradio_ui():
764
  try:
765
  logger.info("Building Gradio UI...")
 
775
  edited_end = gr.State(0)
776
  edited_keywords = gr.State("")
777
  results_state = gr.State([])
778
+ pending_updates = gr.State({}) # {book_slug: {title: {'start_char': int, 'end_char': int}}}
779
  codex_tree_state = gr.State(load_codex_tree())
780
  # For chained dropdowns (shared for assignment and tree nav)
781
  cat_path = gr.State([])
 
792
  search_mode = gr.Dropdown(label="Search Mode", choices=["Keywords (Exact/Phrase Matches)", "Semantic (Conceptual Similarity)", "Both (Hybrid)", "Exact (Word/Phrase)"], value="Both (Hybrid)")
793
  min_score_input = gr.Number(label="Min Score Threshold", value=0.1, minimum=0.0, step=0.05)
794
  search_btn = gr.Button("Search")
 
795
  # Main Row: Three Columns
796
  with gr.Row(variant="panel"):
797
  # Left: Search results
 
804
  gr.Markdown("### Story Viewer")
805
  viewer = gr.HTML(value="Select a story to view...", show_label=False)
806
  hidden_page = gr.Textbox(visible=False, show_label=False)
807
+ start_slider = gr.Slider(label="Start Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False) # Larger max for multi-book
808
  end_slider = gr.Slider(label="End Char", minimum=0, maximum=1000000, step=1, interactive=True, visible=False)
809
  start_phrase = gr.Textbox(label="Start After Phrase (e.g., introduction)", placeholder="Phrase to set start after...")
810
  end_phrase = gr.Textbox(label="End Before Phrase (e.g., conclusion)", placeholder="Phrase to set end before...")
 
818
  level4 = gr.Dropdown(label="Level 4", choices=[], value=None)
819
  level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
820
  level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
 
821
  reset_btn = gr.Button("Reset Boundaries")
822
  edit_status = gr.Textbox(label="Status", value="", interactive=False)
823
  add_btn = gr.Button("Add to List")
 
1077
 
1078
  def assign_category(l1, l2, l3, l4, l5, l6, tree, sel, start, end, kw):
1079
  if not sel:
1080
+ return tree
1081
  path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
1082
  if not path:
1083
+ return tree
1084
  story = {
1085
  "title": sel['title'],
1086
  "book_slug": sel['book_slug'],
 
1091
  }
1092
  tree = assign_to_path(tree, path, story)
1093
  save_codex_tree(tree)
1094
+ return tree
1095
+ level6.change(assign_category, [level1, level2, level3, level4, level5, level6, codex_tree_state, selected_story, edited_start, edited_end, edited_keywords], codex_tree_state)
1096
 
1097
  with gr.TabItem("Codex Tree"):
1098
  with gr.Row():
 
1106
  tree_level5 = gr.Dropdown(label="Level 5", choices=[], value=None)
1107
  tree_level6 = gr.Dropdown(label="Level 6", choices=[], value=None)
1108
  tree_stories_radio = gr.Radio(label="Stories", choices=[], interactive=True)
 
1109
  # Chained updates same as above
1110
+ tree_level1.change(update_level2, tree_level1, tree_level2).then(lambda: gr.update(value=None), tree_level2, tree_level2)
1111
+ tree_level2.change(update_level3, [tree_level1, tree_level2], tree_level3).then(lambda: gr.update(value=None), tree_level3, tree_level3)
1112
+ tree_level3.change(update_level4, [tree_level1, tree_level2, tree_level3], tree_level4).then(lambda: gr.update(value=None), tree_level4, tree_level4)
1113
+ tree_level4.change(update_level5, [tree_level1, tree_level2, tree_level3, tree_level4], tree_level5).then(lambda: gr.update(value=None), tree_level5, tree_level5)
1114
+ tree_level5.change(update_level6, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5], tree_level6).then(lambda: gr.update(value=None), tree_level6, tree_level6)
 
 
1115
  def update_tree_stories(l1, l2, l3, l4, l5, l6, tree):
1116
  path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
1117
  stories = get_stories_at_path(tree, path)
1118
+ choices = [s['title'] for s in stories]
1119
+ return gr.update(choices=choices, value=None)
 
1120
  tree_level6.change(update_tree_stories, [tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], tree_stories_radio)
1121
 
1122
  download_json_btn = gr.Button("Download codex_tree.json")
 
1132
  view_in_text_btn = gr.Button("View in Text")
1133
 
1134
  # Tree events
1135
def select_tree_story(selected_title, l1, l2, l3, l4, l5, l6, tree):
    """Render the story picked in the tree's story list.

    Returns a (content, view_mode) pair. The view mode is always "Static";
    the content is the rendered story, or a short message when no title is
    selected or no story with that title exists at the selected path.
    """
    if not selected_title:
        return "No story selected.", "Static"

    # Only the levels that actually have a value make up the path.
    selected_levels = [level for level in (l1, l2, l3, l4, l5, l6) if level]
    for candidate in get_stories_at_path(tree, selected_levels):
        if candidate['title'] == selected_title:
            # First title match wins, as with the original next(...) lookup.
            return render_static_story(candidate), "Static"
    return "Story not found.", "Static"
1144
+ tree_stories_radio.change(select_tree_story, [tree_stories_radio, tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], [tree_viewer, view_mode])
1145
 
1146
+ def toggle_view_mode(selected_title, mode, l1, l2, l3, l4, l5, l6, tree):
1147
+ path = [p for p in [l1, l2, l3, l4, l5, l6] if p]
1148
  stories = get_stories_at_path(tree, path)
1149
+ story = next((s for s in stories if s['title'] == selected_title), None)
1150
  if not story:
1151
  return "Story not found.", "Static"
1152
  if mode == "Static":
 
1155
  return html, "Book"
1156
  else:
1157
  return render_static_story(story), "Static"
1158
+ view_in_text_btn.click(toggle_view_mode, [tree_stories_radio, view_mode, tree_level1, tree_level2, tree_level3, tree_level4, tree_level5, tree_level6, codex_tree_state], [tree_viewer, view_mode])
1159
 
1160
  return demo
1161
  except Exception as e: