diff --git a/README.md b/README.md index 9566a94f23d3480d6c9f80e720ebc6aad5c49d59..fea96e99bb2ff0d53030a03b57a567d4927fc970 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ --- -title: GenDoc -emoji: 🐨 +title: FormatDoc +emoji: 👁 colorFrom: blue -colorTo: green +colorTo: red sdk: gradio -sdk_version: 3.42.0 +sdk_version: 3.34.0 app_file: app.py pinned: false -license: openrail +license: eupl-1.1 --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..38a640db751dfefc879843f201c334aa119fb99f --- /dev/null +++ b/app.py @@ -0,0 +1,19 @@ +import os +from langchain.llms import OpenAI +from config import config +from src.control.control import Controller +import src.view.view as view + +os.environ["TOKENIZERS_PARALLELISM"] = "true" + +if not "OPENAI_API_KEY" in os.environ: + from config_key import OPENAI_API_KEY + + os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY + +llm_model = OpenAI(temperature=0) + +ctrl = Controller(config) +app = view.run(controller=ctrl, config=config) + +app.queue().launch() diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..fb8ad8c3a7972bca34d3f7d720dc346e0675a5ac --- /dev/null +++ b/config.py @@ -0,0 +1,29 @@ +import os + +config = { + 'templates_path': 'data/templates', + 'these_docs_path': 'data/examples/', + 'new_docs_path': 'data/examples/', + 'default_template_index': 0, + 'styled_docs_path': 'temp/styles_files', + 'generated_docs_path': 'temp/generated_files', + 'options': ["Ajouter les sections du début", "Recentrer les tableaux", "Recentrer les images", + "Redimensionner les tableaux", "Redimensionner les images", + "Vérifier la cohérence des langues"], + 'max_styles': 300, + 'log_msg': { + 'suppressed_styles': 'Les styles suivants ont été supprimés : \n', + 'modified_styles': 'Les styles suivants ont été modifiés : \n', + 'added_styles': 
'Les styles suivants ont été ajoutés :\n', + 'modified_style': ' - ', + 'color': ' la couleur,', + 'font size': ' la taille de la fonte,', + 'font': ' la fonte,', + 'all_caps': ' les majuscules,', + 'bold': 'le caractère gras', + 'document': '\n============================\n Sur le document : ', + }, +} + +templates = [t for t in os.listdir(config['templates_path']) if t.endswith((".docx",))] +config.update({'templates': templates}) diff --git a/config_key.py b/config_key.py new file mode 100644 index 0000000000000000000000000000000000000000..16b46eab50b71494f961ac108856fea73ccf8ff7 --- /dev/null +++ b/config_key.py @@ -0,0 +1 @@ +OPENAI_API_KEY = "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" diff --git a/data/.DS_Store b/data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..823c93413ad551605f396eb17c26c99d15f0f7f2 Binary files /dev/null and b/data/.DS_Store differ diff --git a/data/doc.xml b/data/doc.xml new file mode 100644 index 0000000000000000000000000000000000000000..162764c458598c24e073f5292bb8d20657c12c04 --- /dev/null +++ b/data/doc.xml @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + Réponse à + + + + + + + + + + Nom du Client + + + + + + + + + + + + pour leStyle pour cette page seulement (non recopié en en-tête)Nom du projetDate de remiseJJ/MM/AAAAStrictement confidentielrightbottom00clause de confidentialitéToute information contenue dans ce document strictement confidentiel est fournie à STYLEREF .CompanyName \\* MERGEFORMAT Nom du Client dans le seul but de répondre à ses demandes et ne peut être utilisée à d’autres fins. 
STYLEREF .CompanyName \\* MERGEFORMAT Nom du Client s’engage à ne pas publier ni faire connaître tout ou partie de ces informations à quelque tierce partie que ce soit sans l’autorisation préalable d’Orange.© copyright 2018Tous droits réservésvotre contactNom :Titre :Email :@orange.comTél :Mobile :Adresse :Site Web :http://www.orange-business.comTable des matières TOC \\o "1-3" \\h \\z \\u Aucune entrée de table des matières n\'a été trouvée.Liste des tableaux TOC \\h \\z \\c "Tableau" Aucune entrée de table d\'illustration n\'a été trouvée.Liste des figures TOC \\h \\z \\c "Figure" Aucune entrée de table d\'illustration n\'a été trouvée.CccQsddDsbvbvnFezjfzJzekkfjk Nf nvf z,v$ \ No newline at end of file diff --git a/data/examples/.DS_Store b/data/examples/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/data/examples/.DS_Store differ diff --git a/data/examples/AldoMoro.docx b/data/examples/AldoMoro.docx new file mode 100644 index 0000000000000000000000000000000000000000..ceef84680b54bf33053892157530b58487042635 Binary files /dev/null and b/data/examples/AldoMoro.docx differ diff --git a/data/examples/Aldo_Moro_simple.docx b/data/examples/Aldo_Moro_simple.docx new file mode 100644 index 0000000000000000000000000000000000000000..fda7cf9bc2fb202cb1fa5d6b7b37540384f6f7eb Binary files /dev/null and b/data/examples/Aldo_Moro_simple.docx differ diff --git a/data/examples/Aldo_Moro_simple_rouge.docx b/data/examples/Aldo_Moro_simple_rouge.docx new file mode 100644 index 0000000000000000000000000000000000000000..84d686410baefb07c465378c3dbb707d7d60f1d4 Binary files /dev/null and b/data/examples/Aldo_Moro_simple_rouge.docx differ diff --git a/data/examples/Aldo_Moro_simple_style.docx b/data/examples/Aldo_Moro_simple_style.docx new file mode 100644 index 0000000000000000000000000000000000000000..a763466ba6b53e84de1f076092f4ebdd66e6cc53 Binary files /dev/null and 
b/data/examples/Aldo_Moro_simple_style.docx differ diff --git a/data/examples/Aldo_Moro_simple_vert.docx b/data/examples/Aldo_Moro_simple_vert.docx new file mode 100644 index 0000000000000000000000000000000000000000..726e56b9a747efca12e490c1b0af329a35954085 Binary files /dev/null and b/data/examples/Aldo_Moro_simple_vert.docx differ diff --git a/data/examples/CorpTemplate.docx b/data/examples/CorpTemplate.docx new file mode 100644 index 0000000000000000000000000000000000000000..8dd942a03290cfa719726d6891a1cefe55554111 Binary files /dev/null and b/data/examples/CorpTemplate.docx differ diff --git a/data/examples/CorpTemplate_.docx b/data/examples/CorpTemplate_.docx new file mode 100644 index 0000000000000000000000000000000000000000..3506e2577b9a86f84e93c488f982a07abd4547e5 Binary files /dev/null and b/data/examples/CorpTemplate_.docx differ diff --git a/data/examples/[Content_Types].xml b/data/examples/[Content_Types].xml new file mode 100644 index 0000000000000000000000000000000000000000..4b35629217f39629a1d0880be54361e325debc0a --- /dev/null +++ b/data/examples/[Content_Types].xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/_rels/.rels b/data/examples/_rels/.rels new file mode 100644 index 0000000000000000000000000000000000000000..33f70fb26e29067a20c2b4489c1da313dffcab67 --- /dev/null +++ b/data/examples/_rels/.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/[Content_Types].xml b/data/examples/corpd/[Content_Types].xml new file mode 100644 index 0000000000000000000000000000000000000000..c8c6a1c8b13f60d272cd939912d30f59a5f84cb1 --- /dev/null +++ b/data/examples/corpd/[Content_Types].xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/_rels/.rels b/data/examples/corpd/_rels/.rels new file mode 100644 index 0000000000000000000000000000000000000000..57be32812b37eeedd2ff3322519e4fbf716a6003 --- /dev/null +++ b/data/examples/corpd/_rels/.rels @@ -0,0 +1,2 @@ + + \ No 
newline at end of file diff --git a/data/examples/corpd/customXml/_rels/item1.xml.rels b/data/examples/corpd/customXml/_rels/item1.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..b4bc8d63c3905c71ab20bc61e63546740e2710a6 --- /dev/null +++ b/data/examples/corpd/customXml/_rels/item1.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/_rels/item2.xml.rels b/data/examples/corpd/customXml/_rels/item2.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..49b7c9474cb432ed52a61f222cb648ca93e85289 --- /dev/null +++ b/data/examples/corpd/customXml/_rels/item2.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/_rels/item3.xml.rels b/data/examples/corpd/customXml/_rels/item3.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..0bdc1ef975e759bdfe68409d4bb27734233257e4 --- /dev/null +++ b/data/examples/corpd/customXml/_rels/item3.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/_rels/item4.xml.rels b/data/examples/corpd/customXml/_rels/item4.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..cdb040627942f2f253904511b4ca7e3748e0ca9d --- /dev/null +++ b/data/examples/corpd/customXml/_rels/item4.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/item1.xml b/data/examples/corpd/customXml/item1.xml new file mode 100644 index 0000000000000000000000000000000000000000..607faca2f583f4a566c4b899e71705ed41870c3e --- /dev/null +++ b/data/examples/corpd/customXml/item1.xml @@ -0,0 +1 @@ +DocumentLibraryFormDocumentLibraryFormDocumentLibraryForm \ No newline at end of file diff --git a/data/examples/corpd/customXml/item2.xml b/data/examples/corpd/customXml/item2.xml new file mode 100644 index 0000000000000000000000000000000000000000..8c4dde85efd7fc64dd45da7a8e0a5566af41ed58 --- /dev/null +++ 
b/data/examples/corpd/customXml/item2.xml @@ -0,0 +1,111 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This value indicates the number of saves or revisions. The application is responsible for updating this value after each revision. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/item3.xml b/data/examples/corpd/customXml/item3.xml new file mode 100644 index 0000000000000000000000000000000000000000..c8543fbf7d8eba5d1a7a3bde3cca4349afb73687 --- /dev/null +++ b/data/examples/corpd/customXml/item3.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/data/examples/corpd/customXml/item4.xml b/data/examples/corpd/customXml/item4.xml new file mode 100644 index 0000000000000000000000000000000000000000..a2f9e90374eb0c0c98a6fe3a256800eb96fb3e43 --- /dev/null +++ b/data/examples/corpd/customXml/item4.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/data/examples/corpd/customXml/itemProps1.xml b/data/examples/corpd/customXml/itemProps1.xml new file mode 100644 index 0000000000000000000000000000000000000000..e9a990bc2360b07c29b63ab079470dec8529e82a --- /dev/null +++ b/data/examples/corpd/customXml/itemProps1.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/itemProps2.xml b/data/examples/corpd/customXml/itemProps2.xml new file mode 100644 index 0000000000000000000000000000000000000000..b446d876dab26363da2a8e91c272bd0f0e3bcfb6 --- /dev/null +++ b/data/examples/corpd/customXml/itemProps2.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/itemProps3.xml b/data/examples/corpd/customXml/itemProps3.xml new file mode 100644 index 0000000000000000000000000000000000000000..c0201c270758a974dc5f8cdae2be3cf635d413ce --- /dev/null +++ b/data/examples/corpd/customXml/itemProps3.xml @@ 
-0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/customXml/itemProps4.xml b/data/examples/corpd/customXml/itemProps4.xml new file mode 100644 index 0000000000000000000000000000000000000000..b94b08d6cc5c54037c32433394a75534c20764d7 --- /dev/null +++ b/data/examples/corpd/customXml/itemProps4.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/docProps/app.xml b/data/examples/corpd/docProps/app.xml new file mode 100644 index 0000000000000000000000000000000000000000..c464eef06ecbd203a18cf34dcf59fb2ea79151b6 --- /dev/null +++ b/data/examples/corpd/docProps/app.xml @@ -0,0 +1,2 @@ + +15175967Microsoft Office Word082falseTitre1Title1ORANGE FT Groupfalse1140falsefalse16.0000 \ No newline at end of file diff --git a/data/examples/corpd/docProps/core.xml b/data/examples/corpd/docProps/core.xml new file mode 100644 index 0000000000000000000000000000000000000000..c0f78d86a0ab2522083d0884a8a954cdb6021325 --- /dev/null +++ b/data/examples/corpd/docProps/core.xml @@ -0,0 +1,2 @@ + +Microsoft Office Userlaura peligry22023-07-07T10:21:00Z2023-07-07T10:21:00Z \ No newline at end of file diff --git a/data/examples/corpd/docProps/custom.xml b/data/examples/corpd/docProps/custom.xml new file mode 100644 index 0000000000000000000000000000000000000000..62a06e8d2f7d68b0eb35cb9f6e591b1bb63c0708 --- /dev/null +++ b/data/examples/corpd/docProps/custom.xml @@ -0,0 +1,2 @@ + +0x0101000D129E6A83B3234C936C6D85FE6AF210 \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/document.xml.rels b/data/examples/corpd/word/_rels/document.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..e52e333ef4c1119b2c73745f09496bfb04912c66 --- /dev/null +++ b/data/examples/corpd/word/_rels/document.xml.rels @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/header1.xml.rels b/data/examples/corpd/word/_rels/header1.xml.rels new file mode 100644 index 
0000000000000000000000000000000000000000..27811d643015603616c33c50d05f8237ae83426d --- /dev/null +++ b/data/examples/corpd/word/_rels/header1.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/header2.xml.rels b/data/examples/corpd/word/_rels/header2.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..408fe050e0eb63404b52fb4734985f883167c30e --- /dev/null +++ b/data/examples/corpd/word/_rels/header2.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/header3.xml.rels b/data/examples/corpd/word/_rels/header3.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..408fe050e0eb63404b52fb4734985f883167c30e --- /dev/null +++ b/data/examples/corpd/word/_rels/header3.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/header4.xml.rels b/data/examples/corpd/word/_rels/header4.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..2c31216f150550a59258ca158a0e89b83a0925d4 --- /dev/null +++ b/data/examples/corpd/word/_rels/header4.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/header5.xml.rels b/data/examples/corpd/word/_rels/header5.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..408fe050e0eb63404b52fb4734985f883167c30e --- /dev/null +++ b/data/examples/corpd/word/_rels/header5.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/header6.xml.rels b/data/examples/corpd/word/_rels/header6.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..5820cdd45108ec7557e5ded551273cbac0220891 --- /dev/null +++ b/data/examples/corpd/word/_rels/header6.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/header7.xml.rels b/data/examples/corpd/word/_rels/header7.xml.rels new 
file mode 100644 index 0000000000000000000000000000000000000000..408fe050e0eb63404b52fb4734985f883167c30e --- /dev/null +++ b/data/examples/corpd/word/_rels/header7.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/_rels/settings.xml.rels b/data/examples/corpd/word/_rels/settings.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..93568428252d86b8394e0e6c22596ac2185dafd4 --- /dev/null +++ b/data/examples/corpd/word/_rels/settings.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/document.xml b/data/examples/corpd/word/document.xml new file mode 100644 index 0000000000000000000000000000000000000000..049d987d5919d53eabd0af6ba63059cc96ffd179 --- /dev/null +++ b/data/examples/corpd/word/document.xml @@ -0,0 +1,2 @@ + +Réponse àNom du Clientpour leStyle pour cette page seulement (non recopié en en-tête)Nom du projetDate de remiseJJ/MM/AAAAStrictement confidentielrightbottom00clause de confidentialitéToute information contenue dans ce document strictement confidentiel est fournie à STYLEREF .CompanyName \* MERGEFORMAT Nom du Client dans le seul but de répondre à ses demandes et ne peut être utilisée à d’autres fins. 
STYLEREF .CompanyName \* MERGEFORMAT Nom du Client s’engage à ne pas publier ni faire connaître tout ou partie de ces informations à quelque tierce partie que ce soit sans l’autorisation préalable d’Orange.© copyright 2018Tous droits réservésvotre contactNom :Titre :Email :@orange.comTél :Mobile :Adresse :Site Web :http://www.orange-business.comTable des matières TOC \o "1-3" \h \z \u Aucune entrée de table des matières n'a été trouvée.Liste des tableaux TOC \h \z \c "Tableau" Aucune entrée de table d'illustration n'a été trouvée.Liste des figures TOC \h \z \c "Figure" Aucune entrée de table d'illustration n'a été trouvée.CccQsddDsbvbvnFezjfzJzekkfjk Nf nvf z,v$ \ No newline at end of file diff --git a/data/examples/corpd/word/endnotes.xml b/data/examples/corpd/word/endnotes.xml new file mode 100644 index 0000000000000000000000000000000000000000..de29e959874fa3f20127ac60e15ea4da0c178d12 --- /dev/null +++ b/data/examples/corpd/word/endnotes.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/fontTable.xml b/data/examples/corpd/word/fontTable.xml new file mode 100644 index 0000000000000000000000000000000000000000..46f12af8b92908f815ce495477d16a7391c1fc04 --- /dev/null +++ b/data/examples/corpd/word/fontTable.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/footer1.xml b/data/examples/corpd/word/footer1.xml new file mode 100644 index 0000000000000000000000000000000000000000..a52f7aea4b378fc24a99a6b1730b451ccdd2c7e4 --- /dev/null +++ b/data/examples/corpd/word/footer1.xml @@ -0,0 +1,2 @@ + +Clause de confidentialitéPage PAGE \* Arabic \* MERGEFORMAT 2 sur 4 STYLEREF .Classification \* MERGEFORMAT Strictement confidentiel STYLEREF .DateDue \* MERGEFORMAT JJ/MM/AAAA \ No newline at end of file diff --git a/data/examples/corpd/word/footer2.xml b/data/examples/corpd/word/footer2.xml new file mode 100644 index 0000000000000000000000000000000000000000..f4d1bd44b49157c9f44a8ee2b021fe7755c8cc12 --- 
/dev/null +++ b/data/examples/corpd/word/footer2.xml @@ -0,0 +1,2 @@ + +Clause de confidentialitéPage PAGE \* Arabic \* MERGEFORMAT 2 sur 5 STYLEREF .Classification \* MERGEFORMAT Strictement confidentielJJ/MM/AAAA \ No newline at end of file diff --git a/data/examples/corpd/word/footer3.xml b/data/examples/corpd/word/footer3.xml new file mode 100644 index 0000000000000000000000000000000000000000..887fd4a417c5d5b02e3a9a9595998a468535552b --- /dev/null +++ b/data/examples/corpd/word/footer3.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/footer4.xml b/data/examples/corpd/word/footer4.xml new file mode 100644 index 0000000000000000000000000000000000000000..b42804bf24934079b10fd432bffb3b77a6b1bbc2 --- /dev/null +++ b/data/examples/corpd/word/footer4.xml @@ -0,0 +1,2 @@ + +Chapitre STYLEREF 1 \n Erreur ! Il n'y a pas de texte répondant à ce style dans ce document.: STYLEREF 1 Erreur ! Il n'y a pas de texte répondant à ce style dans ce document.Page PAGE \* Arabic \* MERGEFORMAT 4 sur NUMPAGES \* Arabic \* MERGEFORMAT 3 STYLEREF .Classification \* MERGEFORMAT Strictement confidentiel STYLEREF .DateDue \* MERGEFORMAT JJ/MM/AAAA \ No newline at end of file diff --git a/data/examples/corpd/word/footer5.xml b/data/examples/corpd/word/footer5.xml new file mode 100644 index 0000000000000000000000000000000000000000..da20c06ed7aeb83a691f78691ae9204382f35b7b --- /dev/null +++ b/data/examples/corpd/word/footer5.xml @@ -0,0 +1,2 @@ + +Table des matièresPage PAGE \* Arabic \* MERGEFORMAT 3 sur 4 STYLEREF .Classification \* MERGEFORMAT Strictement confidentielJJ/MM/AAAA \ No newline at end of file diff --git a/data/examples/corpd/word/footer6.xml b/data/examples/corpd/word/footer6.xml new file mode 100644 index 0000000000000000000000000000000000000000..d1a3bb3e0c254bbd0fbf24e0692526e7d90657bf --- /dev/null +++ b/data/examples/corpd/word/footer6.xml @@ -0,0 +1,2 @@ + +Table des matières STYLEREF .DateDue \* MERGEFORMAT JJ/MM/AAAA STYLEREF 
.Classification \* MERGEFORMAT Strictement confidentielpage PAGE \* Arabic \* MERGEFORMAT 3 sur NUMPAGES \* Arabic \* MERGEFORMAT 3 \ No newline at end of file diff --git a/data/examples/corpd/word/footer7.xml b/data/examples/corpd/word/footer7.xml new file mode 100644 index 0000000000000000000000000000000000000000..3d046deb495185a99d75224be926527984344a16 --- /dev/null +++ b/data/examples/corpd/word/footer7.xml @@ -0,0 +1,2 @@ + +Chapitre STYLEREF .Titre1 \w 1.  : STYLEREF .Titre1 CccPage PAGE \* Arabic \* MERGEFORMAT 4 sur 4 STYLEREF .Classification \* MERGEFORMAT Strictement confidentielJJ/MM/AAAA \ No newline at end of file diff --git a/data/examples/corpd/word/footnotes.xml b/data/examples/corpd/word/footnotes.xml new file mode 100644 index 0000000000000000000000000000000000000000..e45e1072af4035763047e13be101f4d63aabd2bb --- /dev/null +++ b/data/examples/corpd/word/footnotes.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/header1.xml b/data/examples/corpd/word/header1.xml new file mode 100644 index 0000000000000000000000000000000000000000..020554a46f1e20e5171820586ef94e8ba188a8cc --- /dev/null +++ b/data/examples/corpd/word/header1.xml @@ -0,0 +1,2 @@ + + STYLEREF .CompanyName Nom du Client STYLEREF .ProjectName Nom du projet \ No newline at end of file diff --git a/data/examples/corpd/word/header2.xml b/data/examples/corpd/word/header2.xml new file mode 100644 index 0000000000000000000000000000000000000000..84fe067f935e87297de9cfdcb6e6820516c9688d --- /dev/null +++ b/data/examples/corpd/word/header2.xml @@ -0,0 +1,2 @@ + +28575028575000 STYLEREF .CompanyName Nom du Client STYLEREF .ProjectName Nom du projet \ No newline at end of file diff --git a/data/examples/corpd/word/header3.xml b/data/examples/corpd/word/header3.xml new file mode 100644 index 0000000000000000000000000000000000000000..e283ea0005600736360b74bf1bdc8c0f8f522962 --- /dev/null +++ b/data/examples/corpd/word/header3.xml @@ -0,0 +1,2 @@ + +285750285750 
\ No newline at end of file diff --git a/data/examples/corpd/word/header4.xml b/data/examples/corpd/word/header4.xml new file mode 100644 index 0000000000000000000000000000000000000000..329f518088ebd93e495940b23b3f780562bf44fa --- /dev/null +++ b/data/examples/corpd/word/header4.xml @@ -0,0 +1,2 @@ + + STYLEREF .CompanyName Nom du Client STYLEREF .ProjectName Nom du projet \ No newline at end of file diff --git a/data/examples/corpd/word/header5.xml b/data/examples/corpd/word/header5.xml new file mode 100644 index 0000000000000000000000000000000000000000..0483f3ed332a3f7ce9b5683e2c6b856059033e38 --- /dev/null +++ b/data/examples/corpd/word/header5.xml @@ -0,0 +1,2 @@ + +28575028575000 STYLEREF .CompanyName Nom du Client STYLEREF .ProjectName Nom du projet \ No newline at end of file diff --git a/data/examples/corpd/word/header6.xml b/data/examples/corpd/word/header6.xml new file mode 100644 index 0000000000000000000000000000000000000000..5e0e63b4d37d5f021eea10d6ef84ba682766ee67 --- /dev/null +++ b/data/examples/corpd/word/header6.xml @@ -0,0 +1,2 @@ + + STYLEREF .CompanyName Nom du Client STYLEREF .ProjectName Nom du projet \ No newline at end of file diff --git a/data/examples/corpd/word/header7.xml b/data/examples/corpd/word/header7.xml new file mode 100644 index 0000000000000000000000000000000000000000..80ef2851a61aa928508cad05479088d9098ce6ea --- /dev/null +++ b/data/examples/corpd/word/header7.xml @@ -0,0 +1,2 @@ + +28575028575000 STYLEREF .CompanyName Nom du Client STYLEREF .ProjectName Nom du projet \ No newline at end of file diff --git a/data/examples/corpd/word/media/image1.jpg b/data/examples/corpd/word/media/image1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cdd57f3616f13b7f5a64995262ba99f99c178e70 Binary files /dev/null and b/data/examples/corpd/word/media/image1.jpg differ diff --git a/data/examples/corpd/word/media/image2.png b/data/examples/corpd/word/media/image2.png new file mode 100644 index 
0000000000000000000000000000000000000000..8173ac6213b34cc908683901a428125d96b6fa81 Binary files /dev/null and b/data/examples/corpd/word/media/image2.png differ diff --git a/data/examples/corpd/word/media/image3.png b/data/examples/corpd/word/media/image3.png new file mode 100644 index 0000000000000000000000000000000000000000..89970e258792fb4107578076a63f9648791a5b04 Binary files /dev/null and b/data/examples/corpd/word/media/image3.png differ diff --git a/data/examples/corpd/word/media/image4.png b/data/examples/corpd/word/media/image4.png new file mode 100644 index 0000000000000000000000000000000000000000..16274e32eff500fd931f03b60f23b0d4e59f3a8a Binary files /dev/null and b/data/examples/corpd/word/media/image4.png differ diff --git a/data/examples/corpd/word/media/image5.jpeg b/data/examples/corpd/word/media/image5.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..68eae97b8f1dce5907d11dab74b37d4d8084286f Binary files /dev/null and b/data/examples/corpd/word/media/image5.jpeg differ diff --git a/data/examples/corpd/word/numbering.xml b/data/examples/corpd/word/numbering.xml new file mode 100644 index 0000000000000000000000000000000000000000..a7962290212b1c9af955c439296cfe05d621b24c --- /dev/null +++ b/data/examples/corpd/word/numbering.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/settings.xml b/data/examples/corpd/word/settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..ec1ec079cab22fd4892dea29dec9b1d5ccd03261 --- /dev/null +++ b/data/examples/corpd/word/settings.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/styles.xml b/data/examples/corpd/word/styles.xml new file mode 100644 index 0000000000000000000000000000000000000000..e0abe63fdebd8f3410c4fc376b9170aa2f8f42c4 --- /dev/null +++ b/data/examples/corpd/word/styles.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/theme/theme1.xml 
b/data/examples/corpd/word/theme/theme1.xml new file mode 100644 index 0000000000000000000000000000000000000000..5d801b6c8939358854ad57e1f0e528826c58b89b --- /dev/null +++ b/data/examples/corpd/word/theme/theme1.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corpd/word/webSettings.xml b/data/examples/corpd/word/webSettings.xml new file mode 100644 index 0000000000000000000000000000000000000000..92bceda3319c3d17490e271ec16d0a63b9f4b883 --- /dev/null +++ b/data/examples/corpd/word/webSettings.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/corporate_simple.docx b/data/examples/corporate_simple.docx new file mode 100644 index 0000000000000000000000000000000000000000..6cf9e77f3b942e2e4bbd936f5e03aa9cf37656f1 Binary files /dev/null and b/data/examples/corporate_simple.docx differ diff --git a/data/examples/corporate_simple_newBody.docx b/data/examples/corporate_simple_newBody.docx new file mode 100644 index 0000000000000000000000000000000000000000..d9c31ca18e34469deaab6c13a90066bbcd8f1d84 Binary files /dev/null and b/data/examples/corporate_simple_newBody.docx differ diff --git a/data/examples/corporate_simple_titre1_modif.docx b/data/examples/corporate_simple_titre1_modif.docx new file mode 100644 index 0000000000000000000000000000000000000000..d028c28f9e59579d08c01597c646950ca9112741 Binary files /dev/null and b/data/examples/corporate_simple_titre1_modif.docx differ diff --git a/data/examples/docProps/app.xml b/data/examples/docProps/app.xml new file mode 100644 index 0000000000000000000000000000000000000000..118dc89eff84ee478d2756570b5d6a43a7ad73be --- /dev/null +++ b/data/examples/docProps/app.xml @@ -0,0 +1,2 @@ + +211480Microsoft Office Word011falsefalse93falsefalse16.0000 \ No newline at end of file diff --git a/data/examples/docProps/core.xml b/data/examples/docProps/core.xml new file mode 100644 index 0000000000000000000000000000000000000000..1c239f6c345e7e993453a1c296dff336ba2635cf --- /dev/null 
+++ b/data/examples/docProps/core.xml @@ -0,0 +1,2 @@ + +laura peligrylaura peligry22023-06-27T07:01:00Z2023-07-06T08:37:00Z \ No newline at end of file diff --git a/data/examples/word/_rels/document.xml.rels b/data/examples/word/_rels/document.xml.rels new file mode 100644 index 0000000000000000000000000000000000000000..c4308925acfd5ae11927e3f6a033c9d6bf8121ea --- /dev/null +++ b/data/examples/word/_rels/document.xml.rels @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/word/document.xml b/data/examples/word/document.xml new file mode 100644 index 0000000000000000000000000000000000000000..939734d238a3f2057720733adb6f27a93f00376e --- /dev/null +++ b/data/examples/word/document.xml @@ -0,0 +1,2 @@ + +Ceci est un titre 1Et ceci un titre 2Et enfin un titre 3Et là du normalEt là du newBody2 \ No newline at end of file diff --git a/data/examples/word/fontTable.xml b/data/examples/word/fontTable.xml new file mode 100644 index 0000000000000000000000000000000000000000..27aba94e9b6fc8884fc775c562ddd6ab261238d0 --- /dev/null +++ b/data/examples/word/fontTable.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/word/numbering.xml b/data/examples/word/numbering.xml new file mode 100644 index 0000000000000000000000000000000000000000..75809d495d4dc4de0d903c5be86d162d96c0da1b --- /dev/null +++ b/data/examples/word/numbering.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/word/settings.xml b/data/examples/word/settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..4873b406c40be17303f133d16edd8922d4ffbaf6 --- /dev/null +++ b/data/examples/word/settings.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/word/styles.xml b/data/examples/word/styles.xml new file mode 100644 index 0000000000000000000000000000000000000000..2e0eef42b73299cc1081509c824344cc411f14fa --- /dev/null +++ b/data/examples/word/styles.xml @@ -0,0 +1,2 @@ + + \ No newline at end 
of file diff --git a/data/examples/word/theme/theme1.xml b/data/examples/word/theme/theme1.xml new file mode 100644 index 0000000000000000000000000000000000000000..27e7bdab38772c5dbbfc1a7dcbc03397953a3430 --- /dev/null +++ b/data/examples/word/theme/theme1.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/word/webSettings.xml b/data/examples/word/webSettings.xml new file mode 100644 index 0000000000000000000000000000000000000000..67b7983172279b25fca138d82da55825a377ae7b --- /dev/null +++ b/data/examples/word/webSettings.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/data/examples/word_simple.docx b/data/examples/word_simple.docx new file mode 100644 index 0000000000000000000000000000000000000000..1395b80baf6ef9db48aad4bb22e68a6c23de7671 Binary files /dev/null and b/data/examples/word_simple.docx differ diff --git a/data/examples/word_simple_.docx b/data/examples/word_simple_.docx new file mode 100644 index 0000000000000000000000000000000000000000..914fb8d93b290422a6dd36dba1ebb25bcab00e7e Binary files /dev/null and b/data/examples/word_simple_.docx differ diff --git a/data/examples/~$doMoro_simple.docx b/data/examples/~$doMoro_simple.docx new file mode 100644 index 0000000000000000000000000000000000000000..1215360c558324f168a348f977b46d5a160ee437 Binary files /dev/null and b/data/examples/~$doMoro_simple.docx differ diff --git a/data/examples/~$rpTemplate .docx b/data/examples/~$rpTemplate .docx new file mode 100644 index 0000000000000000000000000000000000000000..99076fb99e3f3f132cc4073259a7a7b38ad6581e Binary files /dev/null and b/data/examples/~$rpTemplate .docx differ diff --git a/data/templates/CorpTemplate .docx b/data/templates/CorpTemplate .docx new file mode 100644 index 0000000000000000000000000000000000000000..d0bfc374b58fda89a817b0161d53516a150f0052 Binary files /dev/null and b/data/templates/CorpTemplate .docx differ diff --git a/data/templates/Corporate Template Fr.docx b/data/templates/Corporate Template 
Fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..7af4fca2abc74ca0cded449cfce9a99a9c587cf4 Binary files /dev/null and b/data/templates/Corporate Template Fr.docx differ diff --git a/data/templates/Corporate Template Fr.dotx b/data/templates/Corporate Template Fr.dotx new file mode 100644 index 0000000000000000000000000000000000000000..32359e172915c46177c44d33a5b0a3f33a0201cb Binary files /dev/null and b/data/templates/Corporate Template Fr.dotx differ diff --git a/data/templates/Corporate Template Green Fr.docx b/data/templates/Corporate Template Green Fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..aa82fc7318de953712e3f2ec4bb3e0bdccb4a884 Binary files /dev/null and b/data/templates/Corporate Template Green Fr.docx differ diff --git a/data/templates/Corporate Template Red Fr.docx b/data/templates/Corporate Template Red Fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..d9b465aec48f8158bc4c0e644893882981631ff5 Binary files /dev/null and b/data/templates/Corporate Template Red Fr.docx differ diff --git a/data/templates/~$rporate Template Fr.docx b/data/templates/~$rporate Template Fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..1215360c558324f168a348f977b46d5a160ee437 Binary files /dev/null and b/data/templates/~$rporate Template Fr.docx differ diff --git a/data/templates/~$rporate Template Fr.dotx b/data/templates/~$rporate Template Fr.dotx new file mode 100644 index 0000000000000000000000000000000000000000..1215360c558324f168a348f977b46d5a160ee437 Binary files /dev/null and b/data/templates/~$rporate Template Fr.dotx differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e76586d79df5542c5a0a943599d0eb2937a43bdc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,105 @@ +aiofiles==23.2.1 +aiohttp==3.8.5 +aiosignal==1.3.1 +altair==5.1.1 +annotated-types==0.5.0 +anyio==3.7.1 
+async-timeout==4.0.3 +attrs==23.1.0 +backoff==2.2.1 +bcrypt==4.0.1 +beautifulsoup4==4.12.2 +certifi==2023.7.22 +charset-normalizer==3.2.0 +chroma-hnswlib==0.7.2 +chromadb==0.4.8 +click==8.1.7 +coloredlogs==15.0.1 +contourpy==1.1.0 +cycler==0.11.0 +dataclasses-json==0.5.14 +exceptiongroup==1.1.3 +fastapi==0.99.1 +ffmpy==0.3.1 +filelock==3.12.3 +flatbuffers==23.5.26 +fonttools==4.42.1 +frozenlist==1.4.0 +fsspec==2023.9.0 +gradio==3.33.1 +gradio_client==0.5.0 +h11==0.14.0 +httpcore==0.17.3 +httptools==0.6.0 +httpx==0.24.1 +huggingface-hub==0.16.4 +humanfriendly==10.0 +idna==3.4 +importlib-resources==6.0.1 +Jinja2==3.1.2 +jsonschema==4.19.0 +jsonschema-specifications==2023.7.1 +kiwisolver==1.4.5 +langchain==0.0.279 +langsmith==0.0.33 +linkify-it-py==2.0.2 +lxml==4.9.3 +markdown-it-py==2.2.0 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +matplotlib==3.7.2 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +monotonic==1.6 +mpmath==1.3.0 +multidict==6.0.4 +mypy-extensions==1.0.0 +numexpr==2.8.5 +numpy==1.25.2 +onnxruntime==1.15.1 +openai==0.28.0 +orjson==3.9.5 +overrides==7.4.0 +packaging==23.1 +pandas==2.1.0 +Pillow==10.0.0 +posthog==3.0.2 +protobuf==4.24.2 +pulsar-client==3.3.0 +pydantic==1.10.12 +pydantic_core==2.6.3 +pydub==0.25.1 +Pygments==2.16.1 +pyparsing==3.0.9 +PyPika==0.48.9 +python-dateutil==2.8.2 +python-docx==0.8.11 +python-dotenv==1.0.0 +python-multipart==0.0.6 +pytz==2023.3 +PyYAML==6.0.1 +referencing==0.30.2 +requests==2.31.0 +rpds-py==0.10.0 +semantic-version==2.10.0 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.20 +starlette==0.27.0 +sympy==1.12 +tenacity==8.2.3 +tokenizers==0.13.3 +toolz==0.12.0 +tqdm==4.66.1 +typing-inspect==0.9.0 +typing_extensions==4.7.1 +tzdata==2023.3 +uc-micro-py==1.0.2 +urllib3==2.0.4 +uvicorn==0.23.2 +uvloop==0.17.0 +watchfiles==0.20.0 +websockets==11.0.3 +wikipedia==1.4.0 +yarl==1.9.2 diff --git a/src/control/__pycache__/control.cpython-310.pyc b/src/control/__pycache__/control.cpython-310.pyc new file mode 100644 index 
class Controller:
    """Coordinate the uploaded documents, the active template and the generated outputs.

    The controller keeps three pieces of state:

    * ``new_docs`` - working copies of the documents uploaded by the user,
    * ``gen_docs`` - documents produced by ``generate_doc_from_db``,
    * ``log`` - a list of ``{doc_name: log}`` entries describing what was changed.
    """

    def __init__(self, config: Dict):
        """Read the folders from *config* and load the default template.

        *config* must provide ``templates_path``, ``generated_docs_path``,
        ``styled_docs_path``, ``templates`` and ``default_template_index``.
        """
        self.templates_path = config['templates_path']
        self.generated_docs_path = config['generated_docs_path']
        self.styled_docs_path = config['styled_docs_path']
        self.new_docs = []
        self.gen_docs = []

        template_path = config['templates_path'] + '/' + config['templates'][config['default_template_index']]
        self.default_template = Doc(template_path)
        self.template = self.default_template
        self.log = []
        self.differences = []

    def copy_docs(self, temp_docs: list):
        """Copy every uploaded file to ``<generated_docs_path>/<name>_.docx`` and track the copy.

        Each item of *temp_docs* only needs a ``name`` attribute holding the path of
        the uploaded file. ``<name>`` is the file name up to its first dot.
        """
        # All sources are opened before anything is copied, so a file that cannot be
        # loaded aborts the call before any copy is written.
        docs = [Doc(path=temp_doc.name) for temp_doc in temp_docs]
        for temp_doc, doc in zip(temp_docs, docs):
            doc_name = temp_doc.name.split('/')[-1].split('.')[0]
            style_path = f"{self.generated_docs_path}/{doc_name}_.docx"
            self.new_docs.append(doc.copy(style_path))

    def clear_docs(self):
        """Remove the files of all tracked documents, then empty the document lists and the log."""
        for new_doc in self.new_docs:
            new_doc.clear()
        for gen_doc in self.gen_docs:
            gen_doc.clear()
        self.new_docs = []
        self.gen_docs = []
        self.log = []

    def set_template(self, template_name: str = ""):
        """Select the template named *template_name*; an empty name restores the default one."""
        if not template_name:
            self.template = self.default_template
        else:
            template_path = f"{self.templates_path}/{template_name}"
            self.template = Doc(template_path)

    def get_difference_with_template(self):
        """List the styles of the uploaded documents that the current template does not have.

        Returns a pair ``(differences, template_styles)`` where *differences* is a list
        of ``{'doc': <doc>, 'style': <style name>}`` dictionaries (also stored on
        ``self.differences``) and *template_styles* contains the template style names
        that start with a dot.
        """
        self.differences = []
        for new_doc in self.new_docs:
            diff_styles = new_doc.get_different_styles_with_template(template=self.template)
            self.differences += [{'doc': new_doc, 'style': style} for style in diff_styles]
        template_styles = [name for name in self.template.styles.names if name.startswith('.')]
        return self.differences, template_styles

    def map_style(self, this_style_index: int, template_style_name: str):
        """Map a style from 'this' document onto a style of the template.

        *this_style_index* indexes ``self.differences`` as filled by
        ``get_difference_with_template``. The resulting log is appended to ``self.log``.
        """
        diff_dict = self.differences[this_style_index]
        doc = diff_dict['doc']
        this_style_name = diff_dict['style']
        log = doc.copy_one_style(this_style_name, template_style_name, self.template)
        self.log.append({doc.name: log})

    def apply_template(self, add_front_pages: bool):
        """Apply the current template to every uploaded document and record non-empty logs."""
        for new_doc in self.new_docs:
            log = new_doc.apply_template(template=self.template, add_front_pages=add_front_pages)
            if log:
                self.log.append({new_doc.name: log})

    def reset(self):
        """Remove the files of all tracked documents and empty the document lists.

        Unlike ``clear_docs`` the log is left untouched.
        """
        # Doc removes its file through clear(); it has no delete() method, so the
        # former doc.delete() calls raised AttributeError as soon as a document existed.
        for new_doc in self.new_docs:
            new_doc.clear()
        for gen_doc in self.gen_docs:
            gen_doc.clear()
        self.new_docs = []
        self.gen_docs = []

    def get_log(self):
        """Return the message built by ``create_msg_from`` out of the log and the uploaded documents."""
        msg_log = create_msg_from(self.log, self.new_docs)
        return msg_log

    # ------------------------------------------------------------------
    # Source Control
    # ------------------------------------------------------------------

    def get_or_create_collection(self, id_: str) -> str:
        """Return *id_*, generating a new id and creating its collection when *id_* is ``'-1'``.

        A generated id is a ``%m%d%H%M`` timestamp, a dash and ten random lowercase
        letters or digits.
        """
        if id_ != '-1':
            return id_
        now = datetime.datetime.now().strftime("%m%d%H%M")
        letters = string.ascii_lowercase + string.digits
        id_ = now + '-' + ''.join(random.choice(letters) for _ in range(10))
        semantic_db.get_or_create_collection(id_)
        return id_

    def wiki_fetch(self) -> list[str]:
        """Return the titles of the wiki pages matching the tasks of the uploaded documents.

        Titles are de-duplicated; their order is not defined.
        """
        titles = set()
        for new_doc in self.new_docs:
            for task in new_doc.tasks:
                titles.update(get_wikilist(task))
        return list(titles)

    async def wiki_upload_and_store(self, wiki_title: str, collection_name: str):
        """Fetch one wiki page and store its paragraphs in the collection *collection_name*."""
        wikipage = Wiki().fetch(wiki_title)
        if isinstance(wikipage, str):
            # NOTE(review): a plain string looks like the error message of a failed
            # fetch - confirm against Wiki.fetch.
            print(wikipage)
            return
        texts = WikiPage(wikipage.page_content).get_paragraphs()
        add_texts_to_collection(coll_name=collection_name, texts=texts, file=wiki_title, source='wiki')

    # ------------------------------------------------------------------
    # Generate Control
    # ------------------------------------------------------------------

    def generate_doc_from_db(self, collection_name: str, from_files: list[str]) -> list[str]:
        """Resolve the tasks of every uploaded document and write the generated documents.

        For each task a query is derived with ``get_public_paragraph``, the collection
        is searched (restricted to *from_files*) and the retrieved texts are turned into
        a resolution with ``get_private_paragraph``. The resolutions replace the task
        paragraphs in a copy saved as ``<generated_docs_path>/<doc name>e.docx``.

        Returns the paths of the generated documents, which are also appended to
        ``self.gen_docs``.
        """
        gen_paths = []

        for new_doc in self.new_docs:
            queries = [get_public_paragraph(task) for task in new_doc.tasks]
            texts_list = [query_collection(coll_name=collection_name, query=query, from_files=from_files)
                          for query in queries]
            task_resolutions = [get_private_paragraph(task=task, texts=texts)
                                for task, texts in zip(new_doc.tasks, texts_list)]

            gen_path = f"{self.generated_docs_path}/{new_doc.name}e.docx"
            gen_doc = new_doc.copy(gen_path)

            gen_doc.replace_tasks(task_resolutions)
            gen_doc.save_as_docx()
            gen_paths.append(gen_doc.path)
            self.gen_docs.append(gen_doc)
        return gen_paths
0000000000000000000000000000000000000000..0ea0c9f170c00f93a210ad75489c37c6583dc6eb Binary files /dev/null and b/src/domain/__pycache__/block.cpython-310.pyc differ diff --git a/src/domain/__pycache__/container.cpython-310.pyc b/src/domain/__pycache__/container.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44b7c3b8924bb264ddd0931c54b7159873d61cba Binary files /dev/null and b/src/domain/__pycache__/container.cpython-310.pyc differ diff --git a/src/domain/__pycache__/doc.cpython-310.pyc b/src/domain/__pycache__/doc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a93dbacc2757f143c426d486ba2e16db0da26b3b Binary files /dev/null and b/src/domain/__pycache__/doc.cpython-310.pyc differ diff --git a/src/domain/__pycache__/doc.cpython-311.pyc b/src/domain/__pycache__/doc.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64acbe349d91348f42b95bd27e50f5fd86e69629 Binary files /dev/null and b/src/domain/__pycache__/doc.cpython-311.pyc differ diff --git a/src/domain/__pycache__/paragraph.cpython-310.pyc b/src/domain/__pycache__/paragraph.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e366985bcaa65b9550939ff3c7fc1f8bf0d13650 Binary files /dev/null and b/src/domain/__pycache__/paragraph.cpython-310.pyc differ diff --git a/src/domain/__pycache__/styles.cpython-310.pyc b/src/domain/__pycache__/styles.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bee2757231c0fc73fc093ff1c991e6a92e9b1030 Binary files /dev/null and b/src/domain/__pycache__/styles.cpython-310.pyc differ diff --git a/src/domain/__pycache__/wikidoc.cpython-310.pyc b/src/domain/__pycache__/wikidoc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f919f201895bc498b6f60ba71d59b0b256b231d3 Binary files /dev/null and b/src/domain/__pycache__/wikidoc.cpython-310.pyc differ diff --git a/src/domain/block.py 
class Block:
    """Flat, serialisable view of one section of a document.

    A block carries the section title and content (with their ``_fr`` counterparts),
    its position (``index``, ``rank``, ``level``), a ``distance`` value and a list of
    ``specials`` entries.
    """

    # Attributes copied verbatim to and from the dictionary form, in output order.
    _PLAIN_FIELDS = ('doc', 'title', 'title_fr', 'content', 'content_fr',
                     'index', 'rank', 'level', 'distance')

    def __init__(self, doc: str = '', title: str = '', content: str = '', content_fr: str = '',
                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
        self.doc, self.title, self.title_fr = doc, title, ""
        self.content, self.content_fr = content, content_fr
        self.specials = []
        self.index, self.rank, self.level = index, rank, level
        self.distance = distance

    def to_dict(self) -> {}:
        """Return the block as a flat dictionary.

        Every entry of ``specials`` is stored under ``special_<i>`` and their count
        under ``specials_len``.
        """
        serialized = {name: getattr(self, name) for name in self._PLAIN_FIELDS}
        for position, special in enumerate(self.specials):
            serialized[f'special_{position}'] = special
        serialized['specials_len'] = len(self.specials)
        return serialized

    def from_dict(self, block_dict: {}):
        """Overwrite this block with the content of a ``to_dict`` dictionary and return it."""
        for name in self._PLAIN_FIELDS:
            setattr(self, name, block_dict[name])
        self.specials = []
        count = block_dict['specials_len']
        for position in range(count):
            self.specials.append(block_dict[f'special_{position}'])
        return self

    @property
    def distance_str(self) -> str:
        """The distance formatted with two decimals."""
        return f"{self.distance:.2f}"
10000 + + +class Container: + + def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None, + father=None, id_=0): + if index is None: + index = [] + self.level = level + if not self.level: + pass + self.title = title + self.paragraphs = [] + self.all_paragraphs = paragraphs + self.children = [] + self.index = index + self.father = father # if not father, then the container is at the top of the hierarchy + self.id_ = int(str(1) + str(father.id_) + str(id_)) + if paragraphs: + self.paragraphs, self.children = self.create_children(paragraphs.copy(), level, index) + self.containers = [self] + for child in self.children: + self.containers += child.containers + self.blocks = self.get_blocks() + self.normal, self.comment, self.task, _ = self.sort_paragraphs() + + self.one_liner = (self.title.text if self.title else '') + ' ' + self.comment + self.root_text = self.one_liner + ' ' + self.normal + + + @property + def text(self): + text = "" + if self.title: + text = "Titre " + str(self.level) + " : " + self.title.text + '\n' + for p in self.paragraphs: + text += p.text + '\n' + for child in self.children: + text += child.text + return text + + @property + def table_of_contents(self): + toc = [] + if self.title: + toc += [{str(self.level): self.title.text}] + if self.children: + for child in self.children: + toc += child.table_of_contents + return toc + + def move(self, position: int, new_father=None): + current_father = self.father # should be added in the domain + current_father.children.remove(self) + + self.rank = new_father.rank + 1 if new_father else 0 + self.father = new_father + if position < len(new_father.children): + new_father.children.insert(position, self) + else: + new_father.children.append(self) + + def create_children(self, paragraphs, level, rank) -> ([], []): + """ + creates children containers or directly attached content + and returns the list of containers and contents of level+1 + :return: + [Content or 
Container] + """ + attached_paragraphs = [] + container_paragraphs = [] + container_title = None + children = [] + in_children = False + level = INFINITE + child_id = 0 + + while paragraphs: + p = paragraphs.pop(0) + if not in_children and not p.is_structure: + attached_paragraphs.append(p) + else: + in_children = True + if p.is_structure and p.level <= level: # if p is higher or equal in hierarchy + if container_paragraphs or container_title: + children.append(Container(container_paragraphs, container_title, level, rank, self, child_id)) + child_id += 1 + container_paragraphs = [] + container_title = p + level = p.level + + else: # p is strictly lower in hierarchy + container_paragraphs.append(p) + + if container_paragraphs or container_title: + children.append(Container(container_paragraphs, container_title, level, rank, self, child_id)) + child_id += 1 + + return attached_paragraphs, children + + @property + def structure(self): + + self_structure = {str(self.id_): { + 'index': str(self.id_), + 'canMove': True, + 'isFolder': True, + 'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children], + 'canRename': True, + 'data': {}, + 'level': self.level, + 'title': self.title.text if self.title else 'root' + }} + paragraphs_structure = [p.structure for p in self.paragraphs] + structure = [self_structure] + paragraphs_structure + for child in self.children: + structure += child.structure + return structure + + def get_lang(self): + """ + returns the main language of the document + :return: + """ + + def get_structure(self, level=2): + """ + returns the structure of the document + :return: + """ + + def create_embeddings(self): + """ + + :return: + """ + + def get_blocks(self): + block = Block(level=self.level, index=self.index) + if self.title: + block.title = self.title.text + for p in self.paragraphs: + if not p.blank: + if p.text.startswith('##### '): + special_action = p.text.lstrip('##### ') + block.specials.append(special_action) + 
else: + block.content += p.text + blocks = [block] if block.content or block.specials else [] + for child in self.children: + blocks += child.blocks + return blocks + + def get_fulltask(self, doc_one_liner): + siblings_ = self.father.children.copy() + index = siblings_.index(self) + siblings_before_context = [sibling.one_liner for idx, sibling in enumerate(siblings_) if idx < index] + siblings_after_context = [sibling.one_liner for idx, sibling in enumerate(siblings_) if index < idx] + + fulltask = {'description': self.task, + 'about': self.one_liner, + 'doc_description': doc_one_liner, + 'above': self.father.one_liner, + 'before': siblings_before_context, + 'after': siblings_after_context} + return fulltask + + def sort_paragraphs(self) -> (str, str, str, str): + mapping = {'normal': '', 'comment': '', 'task': '', 'title': ''} + for p in self.paragraphs: + mapping[p.type] += ' ' + p.parsed_text + return mapping['normal'], mapping['comment'], mapping['task'], mapping['title'] + + diff --git a/src/domain/doc.py b/src/domain/doc.py new file mode 100644 index 0000000000000000000000000000000000000000..9829b9235c0a6d47e6e214ec8e7714c95d6d20d5 --- /dev/null +++ b/src/domain/doc.py @@ -0,0 +1,95 @@ +import docx + +from src.domain.container import Container +from src.domain.paragraph import Paragraph +from src.domain.styles import Styles +import shutil +import os + + +class Doc: + + def __init__(self, path='', id_=None): + + self.xdoc = docx.Document(path) + self.title = path.split('/')[-1] + self.name = self.title.split('.')[0] + self.id_ = id(self) + self.path = path + paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)] + self.container = Container(paragraphs, father=self) + self.styles = Styles(self.xdoc.styles) + self.tasks = [c.get_fulltask(self.container.one_liner) for c in self.container.containers if c.task] + + def copy(self, new_doc_path): + shutil.copyfile(self.path, new_doc_path) + new_doc = Doc(new_doc_path) + 
new_doc.save_as_docx(new_doc_path) + return new_doc + + def clear(self): + os.remove(self.path) + + def apply_template(self, template, add_front_pages=True): + if add_front_pages: + self.add_front_pages_from(template) + log = self.styles.apply_from(template.styles) + self.save_as_docx() + return log + + def copy_one_style(self, src_style_name: str, dest_style_name: str, template): + style_dest = template.styles.get_style_from_name(dest_style_name) + src_style = self.styles.get_style_from_name(src_style_name) + log = self.styles.copy_one_style(src_style, style_dest) + return log + + def get_different_styles_with_template(self, template): + different_styles = self.styles.get_different_styles(template.styles) + return different_styles + + def save_as_docx(self, path: str = ''): + path = path if path else self.path + self.path = path + self.xdoc.save(path) + + def add_front_pages_from(self, src_doc): + src_paragraphs = [p for p in src_doc.xdoc.paragraphs] + src_paragraphs.reverse() + for p in src_paragraphs: + self.xdoc.paragraphs[0].insert_paragraph_before(text=p.text, style=p.style) + paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)] + self.container = Container(paragraphs, father=self) + + def get_blocks(self): + + def from_list_to_str(index_list): + index_str = str(index_list[0]) + for el in index_list[1:]: + index_str += '.' 
+ str(el) + return index_str + + blocks = self.container.blocks + for block in blocks: + block.doc = self.title + if block.level == 0: + blocks.remove(block) + block.index = from_list_to_str(block.index) + return blocks + + + @property + def structure(self): + + return self.container.structure + + def replace_tasks(self, resolutions: [str]): + if len(resolutions) == len(self.tasks): # exception to be handled + p_tasks = [p for p in self.get_paragraphs() if p.type == 'task'] + for p, r in zip(p_tasks, resolutions): + p.set_text(r) + else: + print(f"résolutions : {len(resolutions)} != {len(self.tasks)} tasks") + return self + + def get_paragraphs(self): + return self.container.all_paragraphs diff --git a/src/domain/paragraph.py b/src/domain/paragraph.py new file mode 100644 index 0000000000000000000000000000000000000000..d49f8c21575999df1189ae14a386a94c5065e5c7 --- /dev/null +++ b/src/domain/paragraph.py @@ -0,0 +1,70 @@ +import string + +INFINITE = 10000 + + +class Paragraph: + + def __init__(self, xparagraph, doc_id: int, id_: int): + + self.xparagraph = xparagraph + self.id_ = int(str(2) + str(doc_id) + str(id_)) + style_name = self.xparagraph.style.name + self.level = self.get_level_from_name(style_name) + self.is_structure = self.level < INFINITE + self.text = self.xparagraph.text + self.type, self.parsed_text = self.parse_text() + + + @property + def structure(self): + structure = {str(self.id_): { + 'index': str(self.id_), + 'canMove': True, + 'isFolder': False, + 'children': [], + 'title': self.text, + 'canRename': True, + 'data': {}, + 'level': self.level, + }} + return structure + + @property + def blank(self): + """ + checks if the paragraph is blank: i.e. 
class Styles:
    """Wrapper around a python-docx style collection with helpers to align it on a template.

    ``xstyles`` is the underlying style collection; it must be iterable, support lookup by
    style name (raising ``KeyError`` when the name is unknown) and ``add_style``.
    ``names`` caches the names of the styles currently held by ``xstyles``.
    """

    # (log label, font attribute) pairs whose modification is reported by ``copy_style``
    _LOGGED_FONT_ATTRS = (
        ('font size', 'size'),
        ('font', 'name'),
        ('all_caps', 'all_caps'),
        ('bold', 'bold'),
    )

    # font attributes copied without being reported
    _SILENT_FONT_ATTRS = (
        'complex_script', 'cs_bold', 'cs_italic', 'double_strike', 'emboss', 'hidden',
        'highlight_color', 'imprint', 'italic', 'math', 'no_proof', 'outline', 'rtl',
        'shadow', 'small_caps', 'snap_to_grid', 'spec_vanish', 'strike', 'subscript',
        'superscript', 'underline', 'web_hidden',
    )

    # style-level attributes copied without being reported
    _SILENT_STYLE_ATTRS = (
        'base_style', 'hidden', 'locked', 'name', 'priority', 'quick_style', 'unhide_when_used',
    )

    def __init__(self, xstyles, doc_id=0, id_=0):
        # the identifier is the decimal concatenation of the document id and the local id
        self.id_ = int(str(doc_id) + str(id_))
        self.xstyles = xstyles
        self.names = [s.name for s in xstyles]

    @staticmethod
    def copy_style(src=None, dest=None) -> set:
        """Copy the formatting of ``src`` onto ``dest`` and report what changed.

        Only paragraph styles are processed; any other style type is left untouched.

        :param src: style to read from
        :param dest: style to overwrite
        :return: set of ``(label, detail)`` tuples. ``detail`` is ``True`` for the colour and
                 ``(new_value, previous_value)`` for 'font size', 'font', 'all_caps' and 'bold'.
        """
        modified_style = set()
        # NOTE(review): the original layout of this method could not be fully recovered; the
        # whole copy is assumed to be limited to paragraph styles - confirm.
        if src.type != WD_STYLE_TYPE.PARAGRAPH:
            return modified_style

        src_font = src.font
        dest_font = dest.font

        # colour: compared before being overwritten, so the log reflects a real change
        src_rgb = src_font.color.rgb
        previous_rgb = dest_font.color.rgb
        if src_rgb:
            new_rgb = RGBColor(src_rgb[0], src_rgb[1], src_rgb[2])
            same_color = bool(previous_rgb) and all(previous_rgb[i] == new_rgb[i] for i in range(3))
            dest_font.color.rgb = new_rgb
        else:
            # the source has no explicit colour: the destination colour is left as it is
            same_color = not previous_rgb
        if not same_color:
            modified_style.add(('color', True))

        for label, attr in Styles._LOGGED_FONT_ATTRS:
            new_value = getattr(src_font, attr)
            previous_value = getattr(dest_font, attr)
            if previous_value != new_value:
                setattr(dest_font, attr, new_value)
                # the previous value is captured before the assignment, otherwise both
                # members of the pair would be identical
                modified_style.add((label, (new_value, previous_value)))

        for attr in Styles._SILENT_FONT_ATTRS:
            setattr(dest_font, attr, getattr(src_font, attr))
        for attr in Styles._SILENT_STYLE_ATTRS:
            setattr(dest, attr, getattr(src, attr))
        return modified_style

    def apply_from(self, template_styles):
        """Align this collection on ``template_styles``.

        Styles unknown to the template are deleted, shared styles are overwritten with the
        template formatting and styles missing here are created from the template.

        :param template_styles: ``Styles`` instance of the template document
        :return: dict with the keys 'suppressed_styles' (names), 'modified_styles'
                 (``(name, changes)`` tuples) and 'added_styles' (names)
        """
        log = {'suppressed_styles': [], 'modified_styles': [], 'added_styles': []}

        # iterate over a snapshot: styles are deleted from the collection inside the loop
        for s in list(self.xstyles):
            if s.name not in template_styles.names:
                log['suppressed_styles'].append(s.name)
                s.delete()
            else:
                src_style = template_styles.get_style_from_name(s.name)
                log_s = self.copy_style(src=src_style, dest=s)
                if log_s:
                    log['modified_styles'].append((s.name, log_s))

        for s in template_styles.xstyles:
            if not self.contains_style(s):
                log['added_styles'].append(s.name)
                self.xstyles.add_style(s.name, s.type)
                self.copy_style(src=s, dest=self.xstyles[s.name])

        # keep the cached names in sync with the deletions and additions made above
        self.names = [s.name for s in self.xstyles]
        return log

    def get_different_styles(self, other_styles) -> [str]:
        """Return the names of the styles of this collection that ``other_styles`` does not list."""
        return [s.name for s in self.xstyles if s.name not in other_styles.names]

    def copy_one_style(self, src_style, dest_style) -> dict:
        """Copy ``src_style`` onto ``dest_style`` and return the log entry describing the mapping."""
        log_msg = \
            f"le style {src_style.name} a été mappé sur le style {dest_style.name} du template"
        log_dict = {'style_mapping': log_msg}
        self.copy_style(src_style, dest_style)
        return log_dict

    def get_style_from_name(self, name: str):
        """Return the style called ``name``.

        The lookup is retried without the first character of ``name``; when both lookups
        fail, the style whose name was first in the collection at construction time is
        returned.

        :raises KeyError: when no style matches and no fallback style is available
        """
        for candidate in (name, name[1:]):
            try:
                return self.xstyles[candidate]
            except KeyError:
                continue
        if self.names:
            try:
                return self.xstyles[self.names[0]]
            except KeyError:
                pass
        raise KeyError(f"no style named {name!r} and no fallback style available")

    def contains_style(self, style):
        """Tell whether a style named like ``style`` (with or without its first character) exists."""
        for candidate in (style.name, style.name[1:]):
            try:
                self.xstyles[candidate]
            except KeyError:
                continue
            return True
        return False
class Container:
    """Node of the section tree built from the parsed lines of a document.

    Attributes set by the constructor:
      - ``lines``: lines located before the first heading of this node
      - ``children``: one ``Container`` per heading of level ``level + 1``
      - ``containers``: this node followed by all its descendants
      - ``root_text``: text of ``lines`` only, joined with spaces
      - ``text``: ``root_text`` followed by the text of the children
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        """
        :param lines: objects exposing ``text``, ``level`` and ``is_structure``
        :param level: depth of this node (0 for the root)
        :param title: heading text of this node
        :param father: parent node or owning document
        :param params: optional dict; titles listed under its 'discarded' key are removed
                       from the direct children of this node
        """
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self.lines = []
        self._expand(lines or [])
        if params and 'discarded' in params:
            self.children = [child for child in self.children if child.title not in params['discarded']]
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        # text attached directly to this node; get_paragraphs relies on it
        self.root_text = ' '.join(line.text for line in self.lines)
        parts = [self.root_text] + [child.text for child in self.children]
        self.text = ' '.join(part for part in parts if part)

    def _expand(self, lines):
        """Split ``lines`` into the lines owned by this node and its child containers."""
        child_level = self.level + 1
        in_child = False
        child_lines = []
        child_title = ''
        for line in lines:
            if not in_child:
                if line.is_structure:
                    # the first heading opens the first child, whatever its declared level
                    in_child = True
                    child_lines = []
                    child_title = line.text
                    line.level = child_level
                else:
                    self.lines.append(line)
            elif child_level < line.level or not line.is_structure:
                # plain text or deeper heading: it belongs to the child being built
                child_lines.append(line)
            elif child_level == line.level:
                # sibling heading: close the current child and open the next one
                self.children.append(Container(lines=child_lines,
                                               level=child_level,
                                               title=child_title,
                                               father=self))
                child_lines = []
                child_title = line.text
            # headings shallower than child_level are ignored
        if in_child:
            self.children.append(Container(lines=child_lines,
                                           level=child_level,
                                           title=child_title,
                                           father=self))

    def get_paragraphs(self, chunk=500):
        """Return the text of this subtree as a list of paragraphs.

        The subtree is returned as a single paragraph when its text is shorter than ``chunk``
        characters; otherwise the node's own text comes first, followed by the paragraphs of
        each child.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs
class Block:
    """Flat record describing one section of a document, convertible to and from a plain dict."""

    # scalar attributes, in the order they are written to the dictionary form
    _SCALAR_FIELDS = ('doc', 'title', 'title_fr', 'content', 'content_fr',
                      'index', 'rank', 'level', 'distance')

    def __init__(self, doc: str = '', title: str = '', content: str = '', content_fr: str = '',
                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
        self.doc, self.title, self.title_fr = doc, title, ""
        self.content, self.content_fr = content, content_fr
        self.specials = []
        self.index, self.rank, self.level = index, rank, level
        self.distance = distance

    def to_dict(self) -> {}:
        """Return the scalar fields, one 'special_<i>' entry per special and 'specials_len'."""
        payload = {field: getattr(self, field) for field in self._SCALAR_FIELDS}
        for position, special in enumerate(self.specials):
            payload['special_' + str(position)] = special
        payload['specials_len'] = len(self.specials)
        return payload

    def from_dict(self, block_dict: {}):
        """Load every field from a dictionary shaped like the result of ``to_dict``; return self."""
        for field in self._SCALAR_FIELDS:
            setattr(self, field, block_dict[field])
        self.specials = []
        self.specials.extend(block_dict['special_' + str(position)]
                             for position in range(block_dict['specials_len']))
        return self

    @property
    def distance_str(self) -> str:
        """Distance rendered with two decimals."""
        return f"{self.distance:.2f}"
class Container:
    """Node of the heading hierarchy of a document.

    A container owns the paragraphs found before its first sub-heading (``paragraphs``) and one
    child container per sub-heading (``children``). ``index`` is the section number of the
    node as a list of ints, ``blocks`` the flattened ``Block`` objects of the subtree.
    """

    def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None,
                 father=None, id_=0):
        """
        :param paragraphs: paragraphs of the subtree, in document order
        :param title: heading paragraph of this node, None for the root
        :param level: heading level of this node
        :param index: section number of this node
        :param father: parent container or owning document; may be None at the top of the hierarchy
        :param id_: position of this node among its siblings
        """
        if index is None:
            index = []
        self.level = level
        self.title = title
        self.paragraphs = []
        self.children = []
        # private copy: the numbering of a node must not change when siblings or children are numbered
        self.index = list(index)
        self.father = father  # if not father, then the container is at the top of the hierarchy
        # NOTE(review): ``rank`` is read by ``structure`` but was never assigned; the position
        # among siblings is assumed to be the intended value - confirm.
        self.rank = id_
        father_id = getattr(father, 'id_', '')
        self.id_ = int(str(1) + str(father_id) + str(id_))
        if paragraphs:
            self.paragraphs, self.children = self.create_children(paragraphs, level, self.index)
        self.blocks = self.get_blocks()
        self.normals, self.comments, self.tasks = self.sort_paragraphs()

    @property
    def text(self):
        """Text of the subtree: title line, own paragraphs, then the text of every child."""
        text = ""
        if self.title:
            text = "Titre " + str(self.level) + " : " + self.title.text + '\n'
        for p in self.paragraphs:
            text += p.text + '\n'
        for child in self.children:
            text += child.text
        return text

    @property
    def text_chunks(self, chunk=500):
        """Own paragraphs grouped in chunks of about ``chunk`` characters, then the children chunks.

        Being a property, it is always evaluated with the default ``chunk``.
        """
        text_chunks = []
        text_chunk = ""
        for p in self.paragraphs:
            if chunk < len(text_chunk) + len(p.text):
                if text_chunk and not text_chunk.isspace():
                    text_chunks.append(text_chunk)
                # the paragraph that overflows opens the next chunk instead of being dropped
                text_chunk = " " + p.text
            else:
                text_chunk += " " + p.text
        if text_chunk and not text_chunk.isspace():
            text_chunks.append(text_chunk)
        for child in self.children:
            text_chunks += child.text_chunks
        return text_chunks

    def get_blocks(self):
        """Build the block of this node (when it has content or specials) plus the children blocks."""
        special_prefix = '##### '
        block = Block(level=self.level, index=self.index)
        if self.title:
            block.title = self.title.text
        for p in self.paragraphs:
            if p.blank:
                continue
            if p.text.startswith(special_prefix):
                # remove the marker itself; lstrip(marker) would strip a character set
                block.specials.append(p.text[len(special_prefix):].lstrip())
            else:
                block.content += p.text
        blocks = [block] if block.content or block.specials else []
        for child in self.children:
            blocks += child.blocks
        return blocks

    @staticmethod
    def _next_index(index: [int], level: int) -> [int]:
        """Return the section number following ``index`` for a heading of depth ``level`` (new list)."""
        if level <= len(index):
            next_index = index[:level]
            if next_index:
                next_index[-1] += 1
            return next_index
        return index + [1] * (level - len(index))

    def create_children(self, paragraphs: [Paragraph], level: int, index: [int]) -> ([Paragraph], []):
        """
        creates children containers or directly attached content
        and returns the paragraphs attached to this node and the list of child containers

        ``level`` is kept for compatibility and is not used: the level of a child is the level
        of its heading. Neither ``paragraphs`` nor ``index`` is modified.
        :return:
            ([Paragraph], [Container])
        """
        attached_paragraphs = []
        children = []
        pending_paragraphs = []
        pending_title = None
        in_children = False
        child_level = INFINITE
        current_index = list(index)

        for p in paragraphs:
            if not in_children and not p.is_structure:
                attached_paragraphs.append(p)
                continue
            in_children = True
            if p.is_structure and p.level <= child_level:
                # p is at least as high in the hierarchy as the pending child: the child is complete
                if pending_paragraphs or pending_title:
                    current_index = self._next_index(current_index, child_level)
                    children.append(Container(pending_paragraphs, pending_title, child_level,
                                              current_index, self, len(children)))
                    pending_paragraphs = []
                pending_title = p
                child_level = p.level
            else:
                # p is normal text or strictly lower in hierarchy: the pending child keeps growing
                pending_paragraphs.append(p)

        if pending_paragraphs or pending_title:
            current_index = self._next_index(current_index, child_level)
            children.append(Container(pending_paragraphs, pending_title, child_level,
                                      current_index, self, len(children)))

        return attached_paragraphs, children

    @property
    def structure(self):
        """List of one-entry dicts describing this node, its paragraphs and its descendants."""
        self_structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': True,
            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
            'canRename': True,
            'data': {},
            'level': self.level,
            'rank': self.rank,
            'title': self.title.text if self.title else 'root'
        }}
        paragraphs_structure = [p.structure for p in self.paragraphs]
        structure = [self_structure] + paragraphs_structure
        for child in self.children:
            structure += child.structure
        return structure

    def sort_paragraphs(self) -> ([Paragraph], [Paragraph], [Paragraph]):
        """Return the attached paragraphs split by type: (normal, comment, task)."""
        mapping = {'normal': [], 'comment': [], 'task': []}
        for p in self.paragraphs:
            # paragraphs of any other type are grouped under their own key and not returned
            mapping.setdefault(p.type, []).append(p)
        return mapping['normal'], mapping['comment'], mapping['task']
class Paragraph:
    """Wrapper around one paragraph of a docx document.

    ``level`` is the heading level read from the style name (``INFINITE`` for body text),
    ``is_structure`` tells whether the paragraph is a heading and ``type`` is one of
    'structure', 'task', 'comment' or 'normal'.
    """

    def __init__(self, xparagraph, doc_id: int, id_: int):
        """
        :param xparagraph: underlying paragraph object; must expose ``text`` and ``style.name``
        :param doc_id: identifier of the owning document
        :param id_: position of the paragraph in the document
        """
        self.xparagraph = xparagraph
        # the identifier is the decimal concatenation of 2, the document id and the position
        self.id_ = int(str(2) + str(doc_id) + str(id_))
        self.level = self.get_level_from_name()
        self.is_structure = self.level < INFINITE
        self.text = self.xparagraph.text
        self.type = self.get_type()

    @property
    def structure(self):
        """One-entry dict, keyed by the paragraph id, describing the paragraph as a leaf node."""
        structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': False,
            'children': [],
            'title': self.text,
            'canRename': True,
            'data': {},
            'level': self.level,
        }}
        return structure

    @property
    def blank(self):
        """
        checks if the paragraph is blank: i.e. it contains no ASCII letter and therefore
        brings no signal (it may be ignored)
        """
        text = self.text.replace('\n', '')
        return set(text).isdisjoint(string.ascii_letters)

    def get_level_from_name(self) -> int:
        """Return the heading level encoded as the last character of a '.Titre' style name.

        ``INFINITE`` is returned when the style is not a '.Titre' style or when its last
        character is not a digit.
        """
        style_name = self.xparagraph.style.name
        level = INFINITE
        if '.Titre' in style_name:
            suffix = style_name[-1]
            try:
                level = int(suffix)
            except ValueError:
                pass
        return level

    def get_type(self) -> str:
        """Return the type of the paragraph.

        NOTE(review): this method was called by ``__init__`` but missing from the class; the
        markers below mirror the "?? " (task) and "++ " (comment) prefixes used by the
        domain-level paragraph parser - confirm.
        """
        if self.is_structure:
            return 'structure'
        if self.text.startswith("?? "):
            return 'task'
        if self.text.startswith("++ "):
            return 'comment'
        return 'normal'
def extract_list(llm_list: str):
    """Extract the list of page titles contained in the raw answer of the LLM.

    The answer is expected to hold a JSON list of strings. The text between the first '['
    and the last ']' is decoded as JSON; when that is not possible (truncated or malformed
    answer) the answer is split on double quotes and the fragments that look like titles
    are kept.

    :param llm_list: raw text returned by the LLM
    :return: list of titles, empty when nothing usable is found
    """

    def looks_like_title(el: str) -> bool:
        # fallback heuristic: longer than 2 characters and more than 3/4 ASCII letters
        if len(el) <= 2:
            return False
        usable_length = sum(1 for c in el if c in string.ascii_letters)
        return len(el) * 3 / 4 < usable_length

    if not isinstance(llm_list, str):
        return []

    start, end = llm_list.find('['), llm_list.rfind(']')
    if 0 <= start < end:
        try:
            parsed = json.loads(llm_list[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a subclass of ValueError
            parsed = None
        if isinstance(parsed, list):
            return [el.strip() for el in parsed if isinstance(el, str) and el.strip()]

    return [el for el in llm_list[1:-1].split('"') if looks_like_title(el)]
CharacterTextSplitter( + separator=".", + chunk_size=chunk, + chunk_overlap=100, + length_function=len, + ) + + paragraphs = texts[0].get_paragraphs(chunk=800) + + split_texts = [] + for p in paragraphs: + split_texts += doc_splitter.split_text(p) + + for split_text in split_texts: + assert type(split_text) == str + assert 0 < len(split_text) < 2 * 500 + + wiki_index = Chroma.from_texts(split_texts) + + return wiki_index + + +def get_wiki_paragraph(wiki_index, task: {}) -> str: + """useful to get a summary in one line from wiki index""" + + task_description = get_public_paragraph(task) + wiki_paragraphs = semantic_search(wiki_index, task_description) + text_content = "" + for p in wiki_paragraphs: + text_content += p.page_content + "/n/n" + + template = (f"\n" + f" Your task consists in generating a paragraph\\n" + f" whose description is delimited by triple backticks: ```{task['description']}```\n" + f"\n" + f" The text generation is based in the documents provided in these sections \n" + f" delimited by by triple backticks: ``` {text_content}``` \n" + f" The paragraph belongs at the top level of the hierarchy to a document \\n" + f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n" + f" Make sure that the paragraph relates the top level of the document\n" + f" \n" + f" The paragraph belongs to a higher paragraph in the hierarchy \\n" + f" whose description is delimited by triple backticks: ``` {task['above']}```\n" + f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n" + f" \n" + f" The paragraphs comes after previous paragraphs \\n" + f" whose description is delimited by triple backticks: ``` {task['before']}```\n" + f" Make sure that the paragraph relates with previous paragraph without any repetition\n" + f" \n" + f" The paragraphs comes before next paragraphs \\n" + f" whose description is delimited by triple backticks: ``` {task['after']}```\n" + f" Make sure that the paragraph 
def get_private_paragraph(texts, task: {}) -> str:
    """Generate the paragraph described by ``task`` from the given source texts.

    :param texts: iterable of strings inserted in the prompt as reference documents
    :param task: dict providing the keys 'description', 'doc_description', 'above', 'before'
                 and 'after'
    :return: the answer of ``openai_llm`` to the prompt
    """

    # each source text is followed by a blank line (the separator used to be the literal "/n/n")
    text_content = "".join(t + "\n\n" for t in texts)

    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm(template)

    return p
def delete_old_collections(old=2):
    """Delete the collections created more than ``old`` hours ago.

    The creation time is read from the 'date' metadata of each collection, a 6-digit
    ``MMDDHH`` stamp. The stamp carries no year: it is interpreted in the current year, or in
    the previous one when that would place it in the future. Collections without a usable
    stamp are left untouched.

    :param old: maximum age, in hours, of the collections to keep
    """
    # ages are compared at hour granularity, like the stamps
    now = datetime.now().replace(minute=0, second=0, microsecond=0)

    for coll in chroma_client.list_collections():
        stamp = str((coll.metadata or {}).get('date', ''))
        if len(stamp) != 6 or not stamp.isdigit():
            continue
        try:
            created = datetime.strptime(f"{now.year}{stamp}", "%Y%m%d%H")
            if now < created:
                # e.g. a December stamp read in January
                created = created.replace(year=created.year - 1)
        except ValueError:
            # impossible date (e.g. month 13, or Feb 29 in a non-leap year)
            continue
        if old * 3600 < (now - created).total_seconds():
            chroma_client.delete_collection(coll.name)
class Wiki(Docstore):
    """
    Wrapper around wikipedia API.
    """

    def __init__(self) -> None:
        """Check that wikipedia package is installed."""
        try:
            import wikipedia  # noqa: F401
        except ImportError as err:
            raise ValueError(
                "Could not import wikipedia python package. "
                "Please install it with `pip install wikipedia`."
            ) from err

    @staticmethod
    def fetch(searched_page: str) -> Union[str, Document]:
        """
        Try to fetch a wiki page.

        If the page exists, return a Document holding its content, with its url stored under
        the 'page' metadata key.
        If the page does not exist or the title is ambiguous, return a message listing
        similar entries.
        """
        import wikipedia

        try:
            # the page is requested once; content and url are read from the same object
            page = wikipedia.page(searched_page)
            result: Union[str, Document] = Document(
                page_content=page.content, metadata={"page": page.url}
            )
        except (wikipedia.PageError, wikipedia.DisambiguationError):
            result = f"Could not find [{searched_page}]. Similar: {wikipedia.search(searched_page)}"
        return result

    @staticmethod
    def search(searched_context: str) -> [str]:
        """
        Finds wiki page titles in relation with the given context.

        Returns the list of titles, or a message (str) when the lookup raises a PageError.
        Declared static so that it can be called on the class as well as on an instance.
        """
        import wikipedia

        try:
            result = wikipedia.search(searched_context)
        except wikipedia.PageError:
            result = f"Could not find [{searched_context}]."
        return result
log.keys(): + if log['suppressed_styles']: + msg = log_msg['suppressed_styles'] + for style_name in log['suppressed_styles']: + msg += " - " + style_name + "\n" + if log['modified_styles']: + msg += log_msg['modified_styles'] + for style, log_s in log['modified_styles']: + msg += log_msg['modified_style'] + style + "\n" + for modif, _ in log_s: + msg += log_msg[modif] + ' ' + msg += '\n' + if log['added_styles']: + msg += log_msg['added_styles'] + for style_name in log['added_styles']: + msg += " - " + style_name + "\n" + if 'style_mapping' in log.keys(): + msg = log['style_mapping'] + if msg: + if doc not in docs_seen: + msg = log_msg['document'] + doc.name + '\n' + msg + docs_seen.append(doc) + log_messages.append(msg) + msg = '' + log_messages_str = '\n'.join(log_messages) + return log_messages_str + diff --git a/src/view/style_components.py b/src/view/style_components.py new file mode 100644 index 0000000000000000000000000000000000000000..e6225bb3206af3d3e27c42ee832d782f54492824 --- /dev/null +++ b/src/view/style_components.py @@ -0,0 +1,11 @@ +import gradio as gr + + +import config + + +def input_files_fn1(input_files_): + update_ = { + output_files_comp: gr.update(visible=True) + } if input_files_ else {} + return update_ \ No newline at end of file diff --git a/src/view/test_view.py b/src/view/test_view.py new file mode 100644 index 0000000000000000000000000000000000000000..cb845ba4c6f6ef2bd66c3e988014808155e1ef45 --- /dev/null +++ b/src/view/test_view.py @@ -0,0 +1,35 @@ +import gradio as gr +import random + +with gr.Blocks() as test: + list_2 = ["choix21", "choix 22", "et choix 23"] + with gr.Row(): + with gr.Accordion("See Details") as grac: + gr.Markdown("lorem ipsum") + hide_btn = gr.Button("hide") + show_btn = gr.Button("show") + + def hide_fn(): + update_ = { + grac: gr.update(open=False) + } + return update_ + + def show_fn(): + update_ = { + grac: gr.update(open=True) + } + return update_ + + hide_btn.click(hide_fn, + inputs=[], + outputs=[grac]) + 
def run(config: Dict, controller: Controller):
    """Build the FormatDoc Gradio interface and wire up its event listeners.

    Args:
        config: application settings. This function reads the keys
            ``'templates'``, ``'default_template_index'``, ``'options'`` and
            ``'max_styles'``.
        controller: application controller; every listener delegates the
            actual work (copying documents, applying templates, fetching
            wiki pages, generating documents) to it.

    Returns:
        The assembled ``gr.Blocks`` application. It is not launched here.
    """
    # NOTE(review): the component nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm it against the intended layout.
    with gr.Blocks() as formatdoc:
        with gr.Row():
            with gr.Column():
                pass  # empty column on the left of the main one

            with gr.Column(scale=10):
                # =====================================================
                # Input and style components
                # =====================================================
                input_files_comp = gr.File(file_count="multiple", file_types=[".docx"])

                with gr.Accordion("Modifier automatiquement les styles", open=False) as style_acc:
                    templates_radio = gr.Radio(
                        label="Templates",
                        choices=config['templates'],
                        value=config['templates'][config['default_template_index']],
                    )
                    options_btn = gr.CheckboxGroup(
                        choices=config['options'],
                        label="Options",
                        info="(Fonctions non implémentées actuellement)",
                        interactive=True,
                    )

                with gr.Accordion("Mapper les styles qui n'existent pas dans le template", open=False) \
                        as newstyles_acc:
                    with gr.Column(scale=2):
                        # A fixed pool of hidden dropdowns; listeners reveal as
                        # many as there are styles to map.
                        newstyle_comps = [gr.Dropdown(visible=False, interactive=True)
                                          for _ in range(config['max_styles'])]

                log_comp = gr.Textbox(label="Journal des modifications", visible=False)

                output_styles_files_comp = gr.File(file_count="multiple", file_types=[".docx"], visible=False)

                with gr.Row():
                    run_style_btn = gr.Button("Appliquer le template et les modifications de style")
                    clear_style_btn = gr.Button("Annuler les modifications de style")

                # =====================================================
                # Generation components
                # =====================================================
                with gr.Accordion("Générer automatiquement une première version du document",
                                  open=False) as gen_acc:
                    generate_option_btn = gr.Radio(
                        label="Automatically generate a draft based on your own database",
                        choices=["Auto generation", "No generation"],
                        value="No generation",
                        interactive=True,
                        visible=False,
                    )

                    db_list_comp = gr.CheckboxGroup(
                        label="Base de connaissance",
                        info="Ces documents constituent la source de référence. Désélectionner pour qu'ils ne soient "
                             "pas pris en compte lors de la génération automatique",
                        visible=True,
                        interactive=True,
                    )
                    db_reset_btn = gr.Button("Effacer la base de connaissance", visible=False) \
                        .style(full_width=False, size="sm")
                    with gr.Accordion("Ajouter des documents dans la base de connaissance", open=False):
                        with gr.Column(visible=True, variant="panel") as add_col:
                            with gr.Tab("Depuis Wikipedia"):
                                wiki_fetch_btn = gr.Button("Rechercher les pages Wikipedia", visible=True)
                                wiki_fetch_btn.style(full_width=False, size="sm")
                                wiki_list_comp = gr.CheckboxGroup(
                                    label="Sélectionner les pages à ajouter dans la base de connaissance",
                                    visible=False,
                                    interactive=True,
                                )
                                wiki_add_to_db_btn = \
                                    gr.Button("Ajouter les documents sélectionnés à la base de connaissance",
                                              visible=False)
                                wiki_add_to_db_btn.style(full_width=False, size="sm")

                            with gr.Tab("Depuis le disque local"):
                                my_files_list_comp = gr.Files(
                                    label="Charger ses documents",
                                    info="Les documents fournissent le contexte utilisé pour la génération de texte",
                                    visible=True,
                                )
                                my_files_add_to_db_btn = gr.Button("Add files to sources", visible=False)
                                my_files_add_to_db_btn.style(full_width=False, size="sm")

                            add_close_btn = gr.Button("Close", visible=False).style(size='sm', full_width=False)
                    with gr.Row():
                        db_add_doc_btn = gr.Button("Ajouter de nouveaux documents", visible=False) \
                            .style(full_width=False, size="sm")

                    output_files_comp = gr.Files(file_count="multiple", visible=False)

                    generate_btn = gr.Button("Générer", interactive=True)

                    clear_btn = gr.Button('Nettoyer', visible=False)
                    rerun_btn = gr.Button('Relancer', visible=False)

            with gr.Column():
                pass  # empty column on the right of the main one

        # ===================================================
        # State variables
        # ===================================================
        # list of wikipage titles of interest for the input text tasks
        wiki_source_var: gr.State = gr.State([])
        # list of wiki document titles in the db (as seen from the UI)
        wiki_db_var: gr.State = gr.State([])
        # list of titles of the files uploaded in the db (as seen from the UI)
        my_files_db_var: gr.State = gr.State([])
        # name of the collection of documents sources in the db
        db_collection_var: gr.State = gr.State("-1")

        # ===================================================
        # Input and styles functions and listeners
        # ===================================================

        def input_files_upload_fn(input_files_):
            """Register the uploaded files and refresh the style-mapping dropdowns."""
            controller.copy_docs(input_files_)
            update_ = {
                newstyles_acc: gr.update(open=False),
                style_acc: gr.update(open=False),
                run_style_btn: gr.update(visible=True),
            }
            update_.update(newstyles_fn())
            return update_

        input_files_comp.upload(input_files_upload_fn,
                                inputs=[input_files_comp],
                                outputs=[style_acc, newstyles_acc, run_style_btn] + newstyle_comps
                                )

        def input_file_clear_fn():
            """Drop the loaded documents and reset every style/generation widget."""
            controller.clear_docs()
            update_ = {
                options_btn: gr.update(value=[]),
                log_comp: gr.update(value="", visible=False),
                output_styles_files_comp: gr.update(value=[], visible=False),
                newstyles_acc: gr.update(open=False),
                style_acc: gr.update(open=False),
                gen_acc: gr.update(open=False),
                output_files_comp: gr.update(visible=False),
            }
            update_.update(newstyles_reset())
            return update_

        input_files_comp.clear(
            input_file_clear_fn,
            inputs=[],
            outputs=[options_btn, output_styles_files_comp, output_files_comp, log_comp, newstyles_acc,
                     gen_acc, style_acc] + newstyle_comps
        )

        def newstyles_fn():
            """Show one dropdown per style missing from the template; hide the rest.

            Returns a dict holding an update for every dropdown of the pool,
            so it can be used directly with ``outputs=newstyle_comps``.
            """
            different_styles, template_styles = controller.get_difference_with_template()
            update_ = {}
            for i, comp in enumerate(newstyle_comps):
                if i < len(different_styles):
                    diff = different_styles[i]
                    update_[comp] = gr.update(
                        visible=True,
                        choices=template_styles,
                        value=None,
                        label=f"document: {diff['doc'].name} style: {diff['style']}",
                    )
                else:
                    # Unused slot: hide and reset it. Returning a bare '' here
                    # only set the value and left a previously shown dropdown
                    # visible with its stale label and choices.
                    update_[comp] = gr.update(visible=False, choices=[], value=None, label='')
            # Styles beyond the size of the dropdown pool have no slot and are
            # not displayed.
            return update_

        def newstyles_reset():
            """Hide and empty every style-mapping dropdown."""
            update_ = {
                comp: gr.update(visible=False,
                                choices=[],
                                label='')
                for comp in newstyle_comps
            }
            return update_

        def templates_fn(templates_):
            """Switch the active template and recompute the styles to map."""
            controller.set_template(templates_)
            return newstyles_fn()

        templates_radio.change(templates_fn,
                               inputs=[templates_radio],
                               outputs=newstyle_comps)

        def newstyle_fns(src_index: int):
            """Create the listener for dropdown number ``src_index``.

            A factory is used so that each listener captures its own index
            rather than the loop variable.
            """
            def newstyle_fn(newstyle_):
                controller.update_style(src_index, newstyle_)
            return newstyle_fn

        for src_index, newstyle_comp in enumerate(newstyle_comps):
            newstyle_comp.input(newstyle_fns(src_index), inputs=[newstyle_comp], outputs=[])

        def clear_style_fn(input_files_):
            """Discard style changes by reloading the documents from the upload widget."""
            controller.clear_docs()
            if input_files_:
                controller.copy_docs(input_files_)
                controller.set_template()
            update_ = {
                options_btn: gr.update(value=[]),
                log_comp: gr.update(value="", visible=False),
                output_styles_files_comp: gr.update(value=[], visible=False),
                newstyles_acc: gr.update(open=False),
                run_style_btn: gr.update(visible=True),
            }
            update_.update(newstyles_fn())
            return update_

        clear_style_btn.click(clear_style_fn,
                              inputs=[input_files_comp],
                              outputs=[options_btn, output_styles_files_comp, log_comp, newstyles_acc, run_style_btn]
                              + newstyle_comps
                              )

        def run_style_fn():
            """Apply the template, then show the change log and the resulting files."""
            controller.apply_template(add_front_pages=False)
            log = controller.get_log()
            output_paths = [nd.path for nd in controller.new_docs]
            update_ = {
                log_comp: gr.update(value=log, visible=True),
                output_styles_files_comp: gr.update(value=output_paths, visible=True),
                run_style_btn: gr.update(visible=False),
            }
            return update_

        run_style_btn.click(run_style_fn,
                            inputs=[],
                            outputs=[log_comp, output_styles_files_comp, run_style_btn])

        # =====================================================
        # Generation functions
        # =====================================================

        def generate_option_fn(db_collection_):
            """Store the collection id returned by the controller in the session state."""
            id_ = controller.get_or_create_collection(db_collection_)
            update_ = {
                db_collection_var: id_
            }
            return update_

        def wiki_fetch1_fn():
            """Reveal the wiki page list before the (second-step) fetch runs."""
            update_ = {
                wiki_list_comp: gr.update(visible=True),
            }
            return update_

        def wiki_fetch2_fn():
            """
            fetch the wikifiles interesting for solving the tasks as defined in the input doc
            """
            wiki_interesting_files = controller.wiki_fetch()
            update_ = {
                wiki_list_comp: gr.update(visible=True,
                                          value=wiki_interesting_files,
                                          choices=wiki_interesting_files),
                wiki_source_var: wiki_interesting_files,
                wiki_add_to_db_btn: gr.update(visible=True),
            }
            return update_

        async def wiki_add_to_db_fn(wiki_list_, wiki_source_, wiki_db_, db_list_, db_collection_):
            """
            adds the wikipages to the db source
            """
            wiki_to_add = [wiki for wiki in (wiki_list_ or []) if wiki not in wiki_db_]
            # Build new lists rather than extending the inputs in place: the
            # inputs include session state, which must not be altered if one
            # of the uploads below fails.
            db_list_ = list(db_list_ or []) + wiki_to_add
            wiki_db_ = wiki_db_ + wiki_to_add
            wiki_source_remaining = [wiki for wiki in wiki_source_ if wiki not in wiki_db_]
            tasks = [controller.wiki_upload_and_store(wiki, db_collection_) for wiki in wiki_to_add]
            await asyncio.gather(*tasks)
            db_not_empty = bool(db_list_)
            wiki_to_add_not_empty = bool(wiki_source_remaining)
            update_ = {
                wiki_db_var: wiki_db_,
                wiki_list_comp: gr.update(value=wiki_source_remaining, choices=wiki_source_remaining),
                wiki_add_to_db_btn: gr.update(visible=wiki_to_add_not_empty),
                db_list_comp: gr.update(
                    visible=True,
                    value=db_list_,
                    choices=db_list_,
                    label="Database content"),
                db_reset_btn: gr.update(visible=db_not_empty),
                generate_btn: gr.update(visible=True, interactive=db_not_empty),
            }
            return update_

        def generate_fn1():
            """Reveal the output-files widget before generation starts."""
            update_ = {
                output_files_comp: gr.update(visible=True)
            }
            return update_

        async def generate_fn2(db_collection_, db_list_):
            """Generate the documents from the selected sources and publish them."""
            # NOTE(review): the controller call is not awaited although this
            # handler is async -- confirm generate_doc_from_db is synchronous.
            output_files = controller.generate_doc_from_db(collection_name=db_collection_,
                                                           from_files=db_list_)
            update_ = {
                output_files_comp: gr.update(value=output_files, visible=True),
            }
            return update_

        # =====================================================
        # Generation listeners
        # =====================================================

        wiki_fetch_btn \
            .click(wiki_fetch1_fn, inputs=[], outputs=[wiki_list_comp]) \
            .then(wiki_fetch2_fn,
                  inputs=[],
                  outputs=[wiki_list_comp, wiki_source_var, wiki_add_to_db_btn])

        wiki_add_to_db_btn \
            .click(generate_option_fn,
                   inputs=[db_collection_var],
                   outputs=[db_collection_var]) \
            .then(wiki_add_to_db_fn,
                  inputs=[wiki_list_comp, wiki_source_var, wiki_db_var, db_list_comp, db_collection_var],
                  outputs=[db_list_comp, wiki_list_comp, wiki_db_var,
                           generate_btn, wiki_add_to_db_btn, db_reset_btn])

        generate_btn \
            .click(generate_fn1,
                   inputs=[],
                   outputs=[output_files_comp]) \
            .then(generate_fn2,
                  inputs=[db_collection_var, db_list_comp],
                  outputs=[output_files_comp])

        # =====================================================
        # Clear and rerun functions and listeners
        # =====================================================

        def clear_fn():
            """Empty the input/output file widgets and hide the clear/rerun buttons."""
            update_ = {
                input_files_comp: gr.update(value=None),
                output_files_comp: gr.update(value=None, visible=False),
                clear_btn: gr.update(visible=False),
                rerun_btn: gr.update(visible=False),
            }
            return update_

        clear_btn.click(clear_fn,
                        inputs=[],
                        outputs=[input_files_comp, output_files_comp, clear_btn, rerun_btn])

    return formatdoc
import docx
from docx.enum.style import WD_STYLE_TYPE
import os
from config import config
from typing import Dict
import random
import datetime
import string

from lxml import etree

from src.domain.doc import Doc


# Scratch script: copies an example document, dumps some of its style
# information and collects the names of the base styles its paragraph styles
# inherit from.

name = 'CorpTemplate.docx'

template_path = config['templates_path'] + '/' + config['templates'][config['default_template_index']]
template = Doc(template_path)
doc_path = config['these_docs_path'] + name
this_doc = Doc(path=doc_path)
new_doc_path = config['new_docs_path'] + this_doc.name + '_.docx'
new_doc = this_doc.copy(new_doc_path)


new_styles = new_doc.styles.xstyles
print(etree.tostring(new_styles['.Titre1'].element))
names = new_doc.styles.names
print(names)
new_doc.save_as_docx()


s = template.styles.xstyles['.BodyText']
# new_styles.add_style(s.name, WD_STYLE_TYPE.PARAGRAPH)


list_styles = [(style, style.name)
               for style in template.styles.xstyles
               if style.type == WD_STYLE_TYPE.LIST]


# Names of the base styles used by the paragraph styles of the copied document.
# A dedicated loop variable keeps `s` bound to the template's '.BodyText'.
base_styles_set = set()
for style in new_styles:
    if style.type == WD_STYLE_TYPE.PARAGRAPH and style.base_style:
        try:
            base_styles_set.add(style.base_style.name)
        except Exception:
            # Best effort: report the style and carry on, but let
            # KeyboardInterrupt / SystemExit through (a bare except did not).
            print(f"failure for {style}")


base_styles = list(base_styles_set)


# Disabled experiment, kept for reference:
# for p in new_doc.xdoc.paragraphs:
#     if p.style == new_styles['_newBody__2']:
#         p.style = s.name
#
# new_styles['_newBody__2'].delete()
# new_doc.save_as_docx()

# Result intentionally discarded; evaluating it still requires at least two
# list styles in the template.
etree.tostring(list_styles[1][0].element)