adrien.aribaut-gaudin
commited on
Commit
·
498db6b
1
Parent(s):
47ca6bf
feat: new public GenProp
Browse files- .gitattributes +1 -0
- .gitignore +6 -0
- app.py +14 -0
- config.py +28 -0
- data/doc.xml +46 -0
- data/templates/Template_presentation.docx +3 -0
- requirements.txt +0 -0
- src/control/controller.py +285 -0
- src/domain/block.py +71 -0
- src/domain/container.py +219 -0
- src/domain/container_requirements.py +140 -0
- src/domain/doc.py +473 -0
- src/domain/paragraph.py +140 -0
- src/domain/requirements_paragraphs.py +41 -0
- src/domain/styles.py +164 -0
- src/domain/wikidoc.py +128 -0
- src/llm/llm_tools.py +337 -0
- src/llm/llms.py +15 -0
- src/model/block.py +49 -0
- src/model/container.py +143 -0
- src/model/doc.py +54 -0
- src/model/paragraph.py +50 -0
- src/reader/reader_for_requirements.py +143 -0
- src/retriever/retriever.py +198 -0
- src/tools/doc_tools.py +73 -0
- src/tools/index_creation.py +72 -0
- src/tools/list_tool.py +17 -0
- src/tools/paragraph_tools.py +45 -0
- src/tools/pretty_print.py +12 -0
- src/tools/semantic_db.py +70 -0
- src/tools/wiki.py +61 -0
- src/view/log_msg.py +47 -0
- src/view/style_components.py +9 -0
- src/view/test_view.py +34 -0
- src/view/view.py +533 -0
- temp/generated_files/file.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.docx filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
venv1
|
3 |
+
test/files_to_test/*
|
4 |
+
config_key.py
|
5 |
+
test
|
6 |
+
.env
|
app.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from config import config
|
2 |
+
from src.control.controller import Controller
|
3 |
+
import src.view.view as view
|
4 |
+
import chromadb
|
5 |
+
from src.retriever.retriever import Retriever
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
client_db = chromadb.Client()
|
10 |
+
|
11 |
+
ctrl = Controller(config, client_db, retriever=Retriever())
|
12 |
+
app = view.run(controller=ctrl, config=config)
|
13 |
+
|
14 |
+
app.queue().launch()
|
config.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
config = {
|
4 |
+
'templates_path': 'data/templates',
|
5 |
+
'these_docs_path': 'data/examples/',
|
6 |
+
'new_docs_path': 'data/examples/',
|
7 |
+
'default_template_index': 0,
|
8 |
+
'styled_docs_path': 'temp/styles_files',
|
9 |
+
'generated_docs_path': 'temp/generated_files',
|
10 |
+
'options': ["Recentrer les tableaux", "Justifier le texte (Normal)"],
|
11 |
+
'max_styles': 300,
|
12 |
+
'log_msg': {
|
13 |
+
'options_applied': 'Les options suivantes ont été appliquées : \n',
|
14 |
+
'suppressed_styles': 'Les styles suivants ont été supprimés : \n',
|
15 |
+
'modified_styles': 'Les styles suivants ont été modifiés : \n',
|
16 |
+
'added_styles': 'Les styles suivants ont été ajoutés :\n',
|
17 |
+
'modified_style': ' - ',
|
18 |
+
'color': ' la couleur,',
|
19 |
+
'font size': ' la taille de la fonte,',
|
20 |
+
'font': ' la fonte,',
|
21 |
+
'all_caps': ' les majuscules,',
|
22 |
+
'bold': 'le caractère gras',
|
23 |
+
'document': '\n============================\n Sur le document : ',
|
24 |
+
},
|
25 |
+
}
|
26 |
+
|
27 |
+
templates = [t for t in os.listdir(config['templates_path']) if t.endswith((".docx"))]
|
28 |
+
config.update({'templates': templates})
|
data/doc.xml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" 
xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh wp14">
|
2 |
+
<w:body>
|
3 |
+
<w:tbl>
|
4 |
+
<w:tblPr>
|
5 |
+
<w:tblpPr w:leftFromText="141" w:rightFromText="141" w:vertAnchor="page" w:tblpY="3001"/>
|
6 |
+
<w:tblOverlap w:val="never"/>
|
7 |
+
<w:tblW w:w="7597" w:type="dxa"/>
|
8 |
+
<w:tblLayout w:type="fixed"/>
|
9 |
+
<w:tblCellMar><w:left w:w="0" w:type="dxa"/>
|
10 |
+
<w:right w:w="0" w:type="dxa"/>
|
11 |
+
</w:tblCellMar><w:tblLook w:val="0000" w:firstRow="0" w:lastRow="0" w:firstColumn="0" w:lastColumn="0" w:noHBand="0" w:noVBand="0"/>
|
12 |
+
</w:tblPr>
|
13 |
+
<w:tblGrid>
|
14 |
+
<w:gridCol w:w="7597"/>
|
15 |
+
</w:tblGrid>
|
16 |
+
<w:tr w:rsidR="008F20A4" w:rsidRPr="00C335CE" w14:paraId="58F81C33" w14:textId="77777777" w:rsidTr="00C4517C">
|
17 |
+
<w:trPr><w:cantSplit/>
|
18 |
+
<w:trHeight w:hRule="exact" w:val="397"/>
|
19 |
+
</w:trPr><w:tc>
|
20 |
+
<w:tcPr>
|
21 |
+
<w:tcW w:w="7597" w:type="dxa"/>
|
22 |
+
<w:shd w:val="clear" w:color="auto" w:fill="auto"/>
|
23 |
+
</w:tcPr><w:p w14:paraId="18CBAEA8" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00C4517C">
|
24 |
+
<w:pPr><w:pStyle w:val="BodyText"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Réponse à</w:t></w:r></w:p></w:tc>
|
25 |
+
</w:tr><w:tr w:rsidR="008F20A4" w:rsidRPr="00C335CE" w14:paraId="6B21A2A2" w14:textId="77777777" w:rsidTr="00C4517C">
|
26 |
+
<w:trPr><w:cantSplit/>
|
27 |
+
<w:trHeight w:hRule="exact" w:val="851"/>
|
28 |
+
</w:trPr><w:tc><w:tcPr><w:tcW w:w="7597" w:type="dxa"/>
|
29 |
+
<w:shd w:val="clear" w:color="auto" w:fill="auto"/><w:vAlign w:val="bottom"/></w:tcPr>
|
30 |
+
<w:p w14:paraId="415EC112" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00DE680A">
|
31 |
+
<w:pPr><w:pStyle w:val="CompanyName"/>
|
32 |
+
<w:framePr w:hSpace="0" w:wrap="auto" w:vAnchor="margin" w:hAnchor="text" w:xAlign="left" w:yAlign="inline"/>
|
33 |
+
<w:suppressOverlap w:val="0"/></w:pPr><w:r w:rsidRPr="00C335CE">
|
34 |
+
<w:t>Nom du Client</w:t>
|
35 |
+
</w:r>
|
36 |
+
</w:p>
|
37 |
+
</w:tc>
|
38 |
+
</w:tr>
|
39 |
+
<w:tr w:rsidR="008F20A4" w:rsidRPr="00C335CE" w14:paraId="4CECECE0" w14:textId="77777777" w:rsidTr="00C4517C">
|
40 |
+
<w:trPr><w:cantSplit/><w:trHeight w:hRule="exact" w:val="397"/>
|
41 |
+
</w:trPr>
|
42 |
+
<w:tc>
|
43 |
+
<w:tcPr>
|
44 |
+
<w:tcW w:w="7597" w:type="dxa"/>
|
45 |
+
<w:shd w:val="clear" w:color="auto" w:fill="auto"/>
|
46 |
+
</w:tcPr><w:p w14:paraId="04690B8E" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00C4517C"><w:pPr><w:pStyle w:val="BodyText"/></w:pPr><w:proofErr w:type="gramStart"/><w:r w:rsidRPr="00C335CE"><w:t>pour</w:t></w:r><w:proofErr w:type="gramEnd"/><w:r w:rsidRPr="00C335CE"><w:t xml:space="preserve"> le</w:t></w:r></w:p></w:tc></w:tr><w:tr w:rsidR="008F20A4" w:rsidRPr="00C335CE" w14:paraId="10E37A3B" w14:textId="77777777" w:rsidTr="00C4517C"><w:trPr><w:cantSplit/><w:trHeight w:hRule="exact" w:val="1871"/></w:trPr><w:tc><w:tcPr><w:tcW w:w="7597" w:type="dxa"/><w:shd w:val="clear" w:color="auto" w:fill="auto"/><w:vAlign w:val="bottom"/></w:tcPr><w:p w14:paraId="3848203F" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00011EBE"><w:pPr><w:pStyle w:val="ProjectNumber"/><w:framePr w:hSpace="0" w:wrap="auto" w:hAnchor="text" w:xAlign="left" w:yAlign="inline"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Style pour cette page seulement (non recopié en en-tête)</w:t></w:r></w:p><w:p w14:paraId="4209E6AE" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00011EBE"><w:pPr><w:pStyle w:val="ProjectName"/><w:framePr w:hSpace="0" w:wrap="auto" w:vAnchor="margin" w:hAnchor="text" w:xAlign="left" w:yAlign="inline"/><w:suppressOverlap w:val="0"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Nom du projet</w:t></w:r></w:p></w:tc></w:tr><w:tr w:rsidR="008F20A4" w:rsidRPr="00C335CE" w14:paraId="5CF53648" w14:textId="77777777" w:rsidTr="00C4517C"><w:trPr><w:cantSplit/><w:trHeight w:hRule="exact" w:val="397"/></w:trPr><w:tc><w:tcPr><w:tcW w:w="7597" w:type="dxa"/><w:shd w:val="clear" w:color="auto" w:fill="auto"/></w:tcPr><w:p w14:paraId="01D4D4D2" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00C4517C"><w:pPr><w:pStyle w:val="BodyText"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Date de 
remise</w:t></w:r></w:p></w:tc></w:tr><w:tr w:rsidR="008F20A4" w:rsidRPr="00C335CE" w14:paraId="47C4D541" w14:textId="77777777" w:rsidTr="00C4517C"><w:trPr><w:cantSplit/><w:trHeight w:hRule="exact" w:val="397"/></w:trPr><w:tc><w:tcPr><w:tcW w:w="7597" w:type="dxa"/><w:shd w:val="clear" w:color="auto" w:fill="auto"/><w:vAlign w:val="bottom"/></w:tcPr><w:p w14:paraId="463058A0" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00C4517C"><w:pPr><w:pStyle w:val="DateDue"/><w:framePr w:hSpace="0" w:wrap="auto" w:vAnchor="margin" w:hAnchor="text" w:xAlign="left" w:yAlign="inline"/><w:suppressOverlap w:val="0"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>JJ/MM/AAAA</w:t></w:r></w:p></w:tc></w:tr><w:tr w:rsidR="008F20A4" w:rsidRPr="00C335CE" w14:paraId="6452573F" w14:textId="77777777" w:rsidTr="00C4517C"><w:trPr><w:cantSplit/><w:trHeight w:hRule="exact" w:val="340"/></w:trPr><w:tc><w:tcPr><w:tcW w:w="7597" w:type="dxa"/><w:shd w:val="clear" w:color="auto" w:fill="auto"/><w:vAlign w:val="bottom"/></w:tcPr><w:p w14:paraId="6536045D" w14:textId="77777777" w:rsidR="008F20A4" w:rsidRPr="00C335CE" w:rsidRDefault="008F20A4" w:rsidP="00C4517C"><w:pPr><w:pStyle w:val="Classification"/><w:framePr w:hSpace="0" w:wrap="auto" w:vAnchor="margin" w:hAnchor="text" w:xAlign="left" w:yAlign="inline"/><w:suppressOverlap w:val="0"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:rPr><w:sz w:val="18"/></w:rPr><w:t>Strictement confidentiel</w:t></w:r></w:p></w:tc></w:tr></w:tbl><w:p w14:paraId="45EA0891" w14:textId="77777777" w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidRDefault="005A01BC" w:rsidP="00891B8F"><w:pPr><w:pStyle w:val="documentControl"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:rPr><w:noProof/><w:lang w:eastAsia="fr-FR"/></w:rPr><w:drawing><wp:anchor distT="0" distB="0" distL="114300" distR="114300" simplePos="0" relativeHeight="251659264" behindDoc="1" locked="0" layoutInCell="0" allowOverlap="0" wp14:anchorId="4A040FE9" 
wp14:editId="18E34965"><wp:simplePos x="0" y="0"/><wp:positionH relativeFrom="margin"><wp:align>right</wp:align></wp:positionH><wp:positionV relativeFrom="margin"><wp:align>bottom</wp:align></wp:positionV><wp:extent cx="6768000" cy="6786000"/><wp:effectExtent l="0" t="0" r="0" b="0"/><wp:wrapNone/><wp:docPr id="11" name="Image 11"/><wp:cNvGraphicFramePr><a:graphicFrameLocks xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" noChangeAspect="1"/></wp:cNvGraphicFramePr><a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture"><pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture"><pic:nvPicPr><pic:cNvPr id="0" name="Picture 2" descr="Page garde offre"/><pic:cNvPicPr><a:picLocks noChangeAspect="1" noChangeArrowheads="1"/></pic:cNvPicPr></pic:nvPicPr><pic:blipFill><a:blip r:embed="rId11"><a:extLst><a:ext uri="{28A0092B-C50C-407E-A947-70E740481C1C}"><a14:useLocalDpi xmlns:a14="http://schemas.microsoft.com/office/drawing/2010/main" val="0"/></a:ext></a:extLst></a:blip><a:stretch><a:fillRect/></a:stretch></pic:blipFill><pic:spPr bwMode="auto"><a:xfrm><a:off x="0" y="0"/><a:ext cx="6768000" cy="6786000"/></a:xfrm><a:prstGeom prst="rect"><a:avLst/></a:prstGeom><a:noFill/><a:ln><a:noFill/></a:ln></pic:spPr></pic:pic></a:graphicData></a:graphic><wp14:sizeRelH relativeFrom="page"><wp14:pctWidth>0</wp14:pctWidth></wp14:sizeRelH><wp14:sizeRelV relativeFrom="page"><wp14:pctHeight>0</wp14:pctHeight></wp14:sizeRelV></wp:anchor></w:drawing></w:r><w:r w:rsidRPr="00C335CE"><w:br w:type="page"/></w:r><w:proofErr w:type="gramStart"/><w:r w:rsidR="00B9348B" w:rsidRPr="00C335CE"><w:lastRenderedPageBreak/><w:t>clause</w:t></w:r><w:proofErr w:type="gramEnd"/><w:r w:rsidR="00B9348B" w:rsidRPr="00C335CE"><w:t xml:space="preserve"> de confidentialité</w:t></w:r></w:p><w:p w14:paraId="3E57861B" w14:textId="77777777" w:rsidR="007132BD" w:rsidRPr="00C335CE" 
w:rsidRDefault="00115837" w:rsidP="00891B8F"><w:pPr><w:pStyle w:val="ProprietaryNoticeText"/><w:rPr><w:color w:val="595959"/></w:rPr></w:pPr><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve">Toute </w:t></w:r><w:r w:rsidR="007132BD" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve">information </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve">contenue dans ce </w:t></w:r><w:r w:rsidR="007132BD" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve">document </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve">strictement confidentiel est fournie à </w:t></w:r><w:r w:rsidR="007132BD" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:fldChar w:fldCharType="begin"/></w:r><w:r w:rsidR="007132BD" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:instrText xml:space="preserve"> STYLEREF .CompanyName \\* MERGEFORMAT </w:instrText></w:r><w:r w:rsidR="007132BD" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:fldChar w:fldCharType="separate"/></w:r><w:r w:rsidR="000C19BE"><w:rPr><w:noProof/><w:color w:val="595959"/></w:rPr><w:t>Nom du Client</w:t></w:r><w:r w:rsidR="007132BD" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:fldChar w:fldCharType="end"/></w:r><w:r w:rsidR="007132BD" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>dans le seul but de répondre à ses demandes et ne peut être utilisé</w:t></w:r><w:r w:rsidR="007A159B" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>e</w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve"> à d’autres fins.</w:t></w:r></w:p><w:p w14:paraId="3EC09512" w14:textId="77777777" 
w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidRDefault="007132BD" w:rsidP="007132BD"><w:pPr><w:pStyle w:val="ProprietaryNoticeText"/><w:rPr><w:color w:val="595959"/></w:rPr></w:pPr><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:fldChar w:fldCharType="begin"/></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:instrText xml:space="preserve"> STYLEREF .CompanyName \\* MERGEFORMAT </w:instrText></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:fldChar w:fldCharType="separate"/></w:r><w:r w:rsidR="000C19BE"><w:rPr><w:noProof/><w:color w:val="595959"/></w:rPr><w:t>Nom du Client</w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:fldChar w:fldCharType="end"/></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidR="00115837" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>s’engage à ne pas publier ni faire connaître tout ou partie de ces informations à quelque tierce partie que ce soit sans l’autorisation préalable d’</w:t></w:r><w:r w:rsidR="00712552" w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>Orange</w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>.</w:t></w:r></w:p><w:p w14:paraId="7D23B684" w14:textId="77777777" w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidRDefault="007132BD" w:rsidP="007132BD"><w:pPr><w:pStyle w:val="ProprietaryNoticeText"/><w:rPr><w:color w:val="595959"/></w:rPr></w:pPr><w:r w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>© copyright 201</w:t></w:r><w:r w:rsidR="00DA1A27"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>8</w:t></w:r></w:p><w:p w14:paraId="5E1DE421" w14:textId="77777777" w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidRDefault="00115837" w:rsidP="00C21D48"><w:pPr><w:pStyle w:val="ProprietaryNoticeText"/><w:spacing w:after="2800"/><w:rPr><w:color w:val="595959"/></w:rPr></w:pPr><w:r 
w:rsidRPr="00C335CE"><w:rPr><w:color w:val="595959"/></w:rPr><w:t>Tous droits réservés</w:t></w:r></w:p><w:p w14:paraId="3BC5C1B2" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00DA1A27"><w:pPr><w:pStyle w:val="PointsContact"/><w:spacing w:before="6000"/></w:pPr><w:proofErr w:type="gramStart"/><w:r w:rsidRPr="00C335CE"><w:t>votre</w:t></w:r><w:proofErr w:type="gramEnd"/><w:r w:rsidRPr="00C335CE"><w:t xml:space="preserve"> contact</w:t></w:r></w:p><w:tbl><w:tblPr><w:tblW w:w="8505" w:type="dxa"/><w:tblInd w:w="85" w:type="dxa"/><w:tblBorders><w:top w:val="single" w:sz="4" w:space="0" w:color="808080"/><w:left w:val="single" w:sz="4" w:space="0" w:color="808080"/><w:bottom w:val="single" w:sz="4" w:space="0" w:color="808080"/><w:right w:val="single" w:sz="4" w:space="0" w:color="808080"/><w:insideH w:val="single" w:sz="4" w:space="0" w:color="808080"/><w:insideV w:val="single" w:sz="4" w:space="0" w:color="808080"/></w:tblBorders><w:tblLayout w:type="fixed"/><w:tblCellMar><w:left w:w="85" w:type="dxa"/><w:right w:w="85" w:type="dxa"/></w:tblCellMar><w:tblLook w:val="0000" w:firstRow="0" w:lastRow="0" w:firstColumn="0" w:lastColumn="0" w:noHBand="0" w:noVBand="0"/></w:tblPr><w:tblGrid><w:gridCol w:w="1095"/><w:gridCol w:w="3264"/><w:gridCol w:w="900"/><w:gridCol w:w="3246"/></w:tblGrid><w:tr w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w14:paraId="64609664" w14:textId="77777777" w:rsidTr="009564E8"><w:trPr><w:cantSplit/></w:trPr><w:tc><w:tcPr><w:tcW w:w="1095" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="2CA49F3C" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00FF5DD0"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Nom</w:t></w:r><w:r w:rsidR="00FF5DD0" w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:t>:</w:t></w:r></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="7410" 
w:type="dxa"/><w:gridSpan w:val="3"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="427D84E8" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00AF69C7"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr></w:p></w:tc></w:tr><w:tr w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w14:paraId="04E8E560" w14:textId="77777777" w:rsidTr="009564E8"><w:tc><w:tcPr><w:tcW w:w="1095" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="6876598F" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00FF5DD0"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Titre</w:t></w:r><w:r w:rsidR="00FF5DD0" w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:t>:</w:t></w:r></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="3264" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="1F56BDFC" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00AF69C7"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="900" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="5DC329C3" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00FF5DD0" w:rsidP="00AF69C7"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:proofErr w:type="gramStart"/><w:r w:rsidRPr="00C335CE"><w:t>Email</w:t></w:r><w:proofErr w:type="gramEnd"/><w:r w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidR="00AF69C7" w:rsidRPr="00C335CE"><w:t>:</w:t></w:r></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="3246" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="2326070B" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00AF69C7"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>@orange.com</w:t></w:r></w:p></w:tc></w:tr><w:tr 
w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w14:paraId="470FE1CB" w14:textId="77777777" w:rsidTr="009564E8"><w:tc><w:tcPr><w:tcW w:w="1095" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="75B44613" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00FF5DD0"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Tél</w:t></w:r><w:r w:rsidR="00FF5DD0" w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:t>:</w:t></w:r></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="3264" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="1FF13B91" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00AF69C7"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="900" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="0486FC7A" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00FF5DD0"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Mobile</w:t></w:r><w:r w:rsidR="00FF5DD0" w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:t>:</w:t></w:r></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="3246" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="49907CC8" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00AF69C7"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr></w:p></w:tc></w:tr><w:tr w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w14:paraId="071A9502" w14:textId="77777777" w:rsidTr="009564E8"><w:trPr><w:cantSplit/></w:trPr><w:tc><w:tcPr><w:tcW w:w="1095" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="4755FA1E" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00D221F1" w:rsidP="00FF5DD0"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r 
w:rsidRPr="00C335CE"><w:t>A</w:t></w:r><w:r w:rsidR="00AF69C7" w:rsidRPr="00C335CE"><w:t>dresse</w:t></w:r><w:r w:rsidR="00FF5DD0" w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidR="00AF69C7" w:rsidRPr="00C335CE"><w:t>:</w:t></w:r></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="7410" w:type="dxa"/><w:gridSpan w:val="3"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="486934C7" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00AF69C7"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr></w:p></w:tc></w:tr><w:tr w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w14:paraId="74B2F971" w14:textId="77777777" w:rsidTr="009564E8"><w:trPr><w:cantSplit/></w:trPr><w:tc><w:tcPr><w:tcW w:w="1095" w:type="dxa"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="21D3119F" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00AF69C7" w:rsidP="00FF5DD0"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Site Web</w:t></w:r><w:r w:rsidR="00FF5DD0" w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidRPr="00C335CE"><w:t>:</w:t></w:r></w:p></w:tc><w:tc><w:tcPr><w:tcW w:w="7410" w:type="dxa"/><w:gridSpan w:val="3"/><w:vAlign w:val="center"/></w:tcPr><w:p w14:paraId="3957B477" w14:textId="77777777" w:rsidR="00AF69C7" w:rsidRPr="00C335CE" w:rsidRDefault="00C168A5" w:rsidP="00C168A5"><w:pPr><w:pStyle w:val="CoordonnesContacts"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>http://www.</w:t></w:r><w:r w:rsidR="00AF69C7" w:rsidRPr="00C335CE"><w:t>orange-business.com</w:t></w:r></w:p></w:tc></w:tr></w:tbl><w:p w14:paraId="11185772" w14:textId="77777777" w:rsidR="008A617E" w:rsidRPr="00C335CE" w:rsidRDefault="008A617E" w:rsidP="008A617E"><w:pPr><w:pStyle w:val="BodyText"/></w:pPr></w:p><w:p w14:paraId="6DDF5966" w14:textId="77777777" w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidRDefault="007132BD" w:rsidP="008A617E"><w:pPr><w:pStyle w:val="BodyText"/><w:sectPr 
w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidSect="002F63F5"><w:headerReference w:type="even" r:id="rId12"/><w:headerReference w:type="default" r:id="rId13"/><w:footerReference w:type="even" r:id="rId14"/><w:footerReference w:type="default" r:id="rId15"/><w:headerReference w:type="first" r:id="rId16"/><w:footerReference w:type="first" r:id="rId17"/><w:pgSz w:w="11906" w:h="16838" w:code="9"/><w:pgMar w:top="720" w:right="720" w:bottom="720" w:left="720" w:header="0" w:footer="0" w:gutter="0"/><w:cols w:space="708"/><w:titlePg/><w:docGrid w:linePitch="360"/></w:sectPr></w:pPr></w:p><w:p w14:paraId="2C75859B" w14:textId="77777777" w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidRDefault="007132BD" w:rsidP="00C168A5"><w:pPr><w:pStyle w:val="STitre1"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:lastRenderedPageBreak/><w:t xml:space="preserve">Table </w:t></w:r><w:r w:rsidR="00FA453C" w:rsidRPr="00C335CE"><w:t>des</w:t></w:r><w:r w:rsidRPr="00C335CE"><w:t xml:space="preserve"> </w:t></w:r><w:r w:rsidR="00FA453C" w:rsidRPr="00C335CE"><w:t>matières</w:t></w:r></w:p><w:p w14:paraId="33936659" w14:textId="77777777" w:rsidR="007132BD" w:rsidRPr="00C335CE" w:rsidRDefault="00000000" w:rsidP="00F16138"><w:pPr><w:pStyle w:val="TM1"/></w:pPr><w:r><w:fldChar w:fldCharType="begin"/></w:r><w:r><w:instrText xml:space="preserve"> TOC \\o "1-3" \\h \\z \\u </w:instrText></w:r><w:r><w:fldChar w:fldCharType="separate"/></w:r><w:r w:rsidR="001F5250" w:rsidRPr="00C335CE"><w:rPr><w:noProof/></w:rPr><w:t xml:space="preserve">Aucune entrée de table des </w:t></w:r><w:r w:rsidR="001F5250" w:rsidRPr="00C335CE"><w:t>matières</w:t></w:r><w:r w:rsidR="001F5250" w:rsidRPr="00C335CE"><w:rPr><w:noProof/></w:rPr><w:t xml:space="preserve"> n\'a été trouvée.</w:t></w:r><w:r><w:rPr><w:noProof/></w:rPr><w:fldChar w:fldCharType="end"/></w:r></w:p><w:p w14:paraId="38CA3268" w14:textId="77777777" w:rsidR="009A37C6" w:rsidRPr="00C335CE" w:rsidRDefault="009A37C6" w:rsidP="0048016E"><w:pPr><w:pStyle 
w:val="STitre1"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Liste des tableaux</w:t></w:r></w:p><w:p w14:paraId="3FFCA3F6" w14:textId="77777777" w:rsidR="009A37C6" w:rsidRPr="00C335CE" w:rsidRDefault="009A37C6" w:rsidP="002F4EEA"><w:pPr><w:pStyle w:val="Tabledesillustrations"/><w:rPr><w:lang w:val="fr-FR"/></w:rPr></w:pPr><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:fldChar w:fldCharType="begin"/></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:instrText xml:space="preserve"> TOC \\h \\z \\c "Tableau" </w:instrText></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:fldChar w:fldCharType="separate"/></w:r><w:r w:rsidR="002F4EEA" w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:t>Aucune entrée de table d\'illustration n\'a été trouvée.</w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:fldChar w:fldCharType="end"/></w:r></w:p><w:p w14:paraId="5A2AE896" w14:textId="77777777" w:rsidR="009A37C6" w:rsidRPr="00C335CE" w:rsidRDefault="009A37C6" w:rsidP="0048016E"><w:pPr><w:pStyle w:val="STitre1"/></w:pPr><w:r w:rsidRPr="00C335CE"><w:t>Liste des figures</w:t></w:r></w:p><w:p w14:paraId="3B29A2A0" w14:textId="77777777" w:rsidR="009A37C6" w:rsidRPr="00C335CE" w:rsidRDefault="009A37C6" w:rsidP="002F4EEA"><w:pPr><w:pStyle w:val="Tabledesillustrations"/><w:rPr><w:lang w:val="fr-FR"/></w:rPr></w:pPr><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:fldChar w:fldCharType="begin"/></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:instrText xml:space="preserve"> TOC \\h \\z \\c "Figure" </w:instrText></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:fldChar w:fldCharType="separate"/></w:r><w:r w:rsidR="001F5250" w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:t>Aucune entrée de table d\'illustration n\'a été trouvée.</w:t></w:r><w:r w:rsidRPr="00C335CE"><w:rPr><w:lang w:val="fr-FR"/></w:rPr><w:fldChar 
w:fldCharType="end"/></w:r></w:p><w:p w14:paraId="4750AFB0" w14:textId="77777777" w:rsidR="00F36D8E" w:rsidRPr="00C335CE" w:rsidRDefault="00F36D8E" w:rsidP="00F36D8E"/><w:p w14:paraId="7D70CB6D" w14:textId="77777777" w:rsidR="000836AE" w:rsidRPr="00C335CE" w:rsidRDefault="000836AE" w:rsidP="000836AE"><w:pPr><w:pStyle w:val="BodyText"/><w:sectPr w:rsidR="000836AE" w:rsidRPr="00C335CE" w:rsidSect="005302A5"><w:headerReference w:type="even" r:id="rId18"/><w:headerReference w:type="default" r:id="rId19"/><w:footerReference w:type="even" r:id="rId20"/><w:footerReference w:type="default" r:id="rId21"/><w:headerReference w:type="first" r:id="rId22"/><w:footerReference w:type="first" r:id="rId23"/><w:pgSz w:w="11906" w:h="16838" w:code="9"/><w:pgMar w:top="720" w:right="720" w:bottom="720" w:left="720" w:header="0" w:footer="0" w:gutter="0"/><w:cols w:space="708"/><w:docGrid w:linePitch="360"/></w:sectPr></w:pPr></w:p><w:p w14:paraId="6B39024A" w14:textId="77777777" w:rsidR="001E4CDD" w:rsidRPr="00ED1502" w:rsidRDefault="001E4CDD" w:rsidP="00ED1502"><w:pPr><w:pStyle w:val="BodyText"/></w:pPr></w:p><w:p w14:paraId="0519CE18" w14:textId="3F897D6A" w:rsidR="0048016E" w:rsidRDefault="00ED038F" w:rsidP="00ED038F"><w:pPr><w:pStyle w:val="Titre10"/><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr></w:pPr><w:proofErr w:type="spellStart"/><w:r><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:lastRenderedPageBreak/><w:t>Ccc</w:t></w:r><w:proofErr w:type="spellEnd"/></w:p><w:p w14:paraId="3BC69584" w14:textId="48227D67" w:rsidR="00ED038F" w:rsidRDefault="00ED038F" w:rsidP="00ED038F"><w:pPr><w:pStyle w:val="Titre20"/></w:pPr><w:proofErr w:type="spellStart"/><w:r><w:t>Qsdd</w:t></w:r><w:proofErr w:type="spellEnd"/></w:p><w:p w14:paraId="747EE9A7" w14:textId="5DFB2DB0" w:rsidR="00ED038F" w:rsidRDefault="00947006" w:rsidP="00845F4B"><w:pPr><w:pStyle w:val="BodyText"/><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr></w:pPr><w:proofErr w:type="spellStart"/><w:r 
w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t>Dsbvbvn</w:t></w:r><w:proofErr w:type="spellEnd"/></w:p><w:p w14:paraId="21497A0D" w14:textId="77777777" w:rsidR="00947006" w:rsidRPr="00845F4B" w:rsidRDefault="00947006" w:rsidP="00845F4B"><w:pPr><w:pStyle w:val="BodyText"/><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr></w:pPr></w:p><w:p w14:paraId="383106B5" w14:textId="5ACDB52E" w:rsidR="00947006" w:rsidRDefault="00947006" w:rsidP="00947006"><w:pPr><w:pStyle w:val="Bullet1"/><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr></w:pPr><w:proofErr w:type="spellStart"/><w:r><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t>Fezjfz</w:t></w:r><w:proofErr w:type="spellEnd"/></w:p><w:p w14:paraId="2C331B80" w14:textId="3A7D6E23" w:rsidR="00947006" w:rsidRDefault="00947006" w:rsidP="00947006"><w:pPr><w:pStyle w:val="Bullet1"/><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr></w:pPr><w:proofErr w:type="spellStart"/><w:r><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t>Jzekkfjk</w:t></w:r><w:proofErr w:type="spellEnd"/></w:p><w:p w14:paraId="603BA9F8" w14:textId="32546CFA" w:rsidR="00947006" w:rsidRPr="00845F4B" w:rsidRDefault="00845F4B" w:rsidP="00845F4B"><w:pPr><w:pStyle w:val="BodyText"/><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr></w:pPr><w:r w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t xml:space="preserve"> </w:t></w:r><w:proofErr w:type="spellStart"/><w:r w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t>Nf</w:t></w:r><w:proofErr w:type="spellEnd"/><w:r w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t xml:space="preserve"> </w:t></w:r><w:proofErr w:type="spellStart"/><w:r w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t>nvf</w:t></w:r><w:proofErr w:type="spellEnd"/><w:r w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t xml:space="preserve"> </w:t></w:r><w:proofErr w:type="spellStart"/><w:proofErr w:type="gramStart"/><w:r 
w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t>z,v</w:t></w:r><w:proofErr w:type="spellEnd"/><w:proofErr w:type="gramEnd"/><w:r w:rsidRPr="00845F4B"><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr><w:t>$</w:t></w:r></w:p><w:p w14:paraId="1E6020A0" w14:textId="77777777" w:rsidR="00845F4B" w:rsidRPr="00845F4B" w:rsidRDefault="00845F4B" w:rsidP="00845F4B"><w:pPr><w:pStyle w:val="BodyText"/><w:rPr><w:rStyle w:val="TexteOrange"/></w:rPr></w:pPr></w:p><w:sectPr w:rsidR="00845F4B" w:rsidRPr="00845F4B" w:rsidSect="00502252"><w:headerReference w:type="default" r:id="rId24"/><w:footerReference w:type="default" r:id="rId25"/><w:pgSz w:w="11906" w:h="16838" w:code="9"/><w:pgMar w:top="720" w:right="720" w:bottom="720" w:left="720" w:header="0" w:footer="0" w:gutter="0"/><w:cols w:space="708"/><w:docGrid w:linePitch="360"/></w:sectPr></w:body></w:document>
|
data/templates/Template_presentation.docx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:93d264f72e69de63159803b9676a6d28e30946b478151f802f135798a2a71f71
|
3 |
+
size 146771
|
requirements.txt
ADDED
Binary file (5.38 kB). View file
|
|
src/control/controller.py
ADDED
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import os
|
3 |
+
from typing import Dict
|
4 |
+
import random
|
5 |
+
import datetime
|
6 |
+
import string
|
7 |
+
import docx
|
8 |
+
from src.tools.doc_tools import get_title
|
9 |
+
from src.domain.doc import Doc
|
10 |
+
from src.domain.wikidoc import WikiPage
|
11 |
+
from src.view.log_msg import create_msg_from
|
12 |
+
import src.tools.semantic_db as semantic_db
|
13 |
+
from src.tools.wiki import Wiki
|
14 |
+
from src.llm.llm_tools import get_wikilist, get_public_paragraph, get_private_paragraph
|
15 |
+
from src.tools.semantic_db import add_texts_to_collection, query_collection
|
16 |
+
import gradio as gr
|
17 |
+
from src.retriever.retriever import Retriever
|
18 |
+
|
19 |
+
class Controller:
|
20 |
+
|
21 |
+
def __init__(self, config: Dict, client_db, retriever):
|
22 |
+
self.templates_path = config['templates_path']
|
23 |
+
self.generated_docs_path = config['generated_docs_path']
|
24 |
+
self.styled_docs_path = config['styled_docs_path']
|
25 |
+
self.new_docs = []
|
26 |
+
self.gen_docs = []
|
27 |
+
self.input_csv = ""
|
28 |
+
template_path = config['templates_path'] + '/' + config['templates'][config['default_template_index']]
|
29 |
+
self.default_template = Doc(template_path)
|
30 |
+
self.template = self.default_template
|
31 |
+
self.log = []
|
32 |
+
self.differences = []
|
33 |
+
self.list_differences = []
|
34 |
+
self.client_db = client_db
|
35 |
+
self.retriever = retriever
|
36 |
+
|
37 |
+
def copy_docs(self, temp_docs: []):
|
38 |
+
"""
|
39 |
+
Initial copy of the incoming document
|
40 |
+
+
|
41 |
+
create collection for requirments retrieval
|
42 |
+
+
|
43 |
+
Initiate paths
|
44 |
+
|
45 |
+
TODO: Rename or refactor the function -> 1 mission / function
|
46 |
+
TODO: To be tested on several documents
|
47 |
+
TODO: Rename create_collection in create_requirement_collection
|
48 |
+
"""
|
49 |
+
doc_names = [doc.name for doc in temp_docs]
|
50 |
+
for i in range(len(doc_names)):
|
51 |
+
if '/' in doc_names[i]:
|
52 |
+
doc_names[i] = doc_names[i].split('/')[-1]
|
53 |
+
elif '\\' in doc_names[i]:
|
54 |
+
doc_names[i] = doc_names[i].split('\\')[-1]
|
55 |
+
doc_names[i] = doc_names[i].split('.')[0]
|
56 |
+
docs = [Doc(path=doc.name) for doc in temp_docs]
|
57 |
+
self.create_collection(docs)
|
58 |
+
style_paths = [f"{self.generated_docs_path}/{dn}_.docx" for dn in doc_names]
|
59 |
+
gen_paths = [f"{self.generated_docs_path}/{dn}_e.docx" for dn in doc_names]
|
60 |
+
for doc, style_path, gen_path in zip(docs, style_paths, gen_paths):
|
61 |
+
new_doc = doc.copy(style_path)
|
62 |
+
self.new_docs.append(new_doc)
|
63 |
+
|
64 |
+
def clear_docs(self):
|
65 |
+
for new_doc in self.new_docs:
|
66 |
+
if os.path.exists(new_doc.path):
|
67 |
+
new_doc.clear()
|
68 |
+
for gen_doc in self.gen_docs:
|
69 |
+
if os.path.exists(gen_doc.path):
|
70 |
+
gen_doc.clear()
|
71 |
+
self.new_docs = []
|
72 |
+
self.gen_docs = []
|
73 |
+
self.log = []
|
74 |
+
path_to_clear = os.path.abspath(self.generated_docs_path)
|
75 |
+
[os.remove(f"{path_to_clear}/{doc}") for doc in os.listdir(path_to_clear)]
|
76 |
+
|
77 |
+
def set_template(self, template_name: str = ""):
|
78 |
+
if not template_name:
|
79 |
+
self.template = self.default_template
|
80 |
+
else:
|
81 |
+
template_path = f"{self.templates_path}/{template_name}"
|
82 |
+
self.template = Doc(template_path)
|
83 |
+
|
84 |
+
def add_template(self, template_path: str):
|
85 |
+
"""
|
86 |
+
TODO: message to be but in config
|
87 |
+
"""
|
88 |
+
if not template_path:
|
89 |
+
return
|
90 |
+
elif not template_path.name.endswith(".docx"):
|
91 |
+
gr.Warning("Seuls les fichiers .docx sont acceptés")
|
92 |
+
return
|
93 |
+
doc = docx.Document(template_path.name)
|
94 |
+
doc.save(self.templates_path + '/' + get_title(template_path.name))
|
95 |
+
|
96 |
+
def delete_curr_template(self, template_name: str):
|
97 |
+
if not template_name:
|
98 |
+
return
|
99 |
+
os.remove(f"{self.templates_path}/{template_name}")
|
100 |
+
|
101 |
+
def retrieve_number_of_misapplied_styles(self):
|
102 |
+
"""
|
103 |
+
not used: buggy !!
|
104 |
+
"""
|
105 |
+
res = {}
|
106 |
+
for new_doc in self.new_docs:
|
107 |
+
res[new_doc] = new_doc.retrieve_number_of_misapplied_styles()
|
108 |
+
return res
|
109 |
+
|
110 |
+
def get_difference_with_template(self):
|
111 |
+
self.differences = []
|
112 |
+
for new_doc in self.new_docs:
|
113 |
+
diff_styles = new_doc.get_different_styles_with_template(template=self.template)
|
114 |
+
diff_dicts = [{'doc': new_doc, 'style': s} for s in diff_styles]
|
115 |
+
self.differences += diff_dicts
|
116 |
+
template_styles = self.template.xdoc.styles
|
117 |
+
template_styles = [style for style in template_styles if style.name in self.template.styles.names]
|
118 |
+
return self.differences, template_styles
|
119 |
+
|
120 |
+
def get_list_styles(self):
|
121 |
+
self.list_differences = []
|
122 |
+
for new_doc in self.new_docs:
|
123 |
+
list_styles = new_doc.get_list_styles()
|
124 |
+
all_lists_styles = [{'doc': new_doc, 'list_style': s} for s in list_styles]
|
125 |
+
self.list_differences += all_lists_styles
|
126 |
+
return self.list_differences
|
127 |
+
|
128 |
+
def map_style(self, this_style_index: int, template_style_name: str):
|
129 |
+
"""
|
130 |
+
maps a style from 'this' document into a style from the template
|
131 |
+
"""
|
132 |
+
#dont make any change if the style is already the same
|
133 |
+
diff_dict = self.differences[this_style_index]
|
134 |
+
doc = diff_dict['doc']
|
135 |
+
this_style_name = diff_dict['style']
|
136 |
+
log = doc.copy_one_style(this_style_name, template_style_name, self.template)
|
137 |
+
if log:
|
138 |
+
self.log.append({doc.name: log})
|
139 |
+
|
140 |
+
def update_list_style(self, this_style_index: int, template_style_name: str):
|
141 |
+
"""
|
142 |
+
maps a style from 'this' document into a style from the template
|
143 |
+
"""
|
144 |
+
#dont make any change if the style is already the same
|
145 |
+
diff_dict = self.list_differences[this_style_index]
|
146 |
+
doc = diff_dict['doc']
|
147 |
+
this_style_name = diff_dict['list_style']
|
148 |
+
log = doc.change_bullet_style(this_style_name, template_style_name, self.template)
|
149 |
+
if log:
|
150 |
+
self.log.append({doc.name: log})
|
151 |
+
|
152 |
+
def update_style(self,index,style_to_modify):
|
153 |
+
return self.map_style(index, style_to_modify) if style_to_modify else None
|
154 |
+
|
155 |
+
def apply_template(self, options_list):
|
156 |
+
for new_doc in self.new_docs:
|
157 |
+
log = new_doc.apply_template(template=self.template, options_list=options_list)
|
158 |
+
if log:
|
159 |
+
self.log.append({new_doc.name: log})
|
160 |
+
|
161 |
+
def reset(self):
|
162 |
+
for new_doc in self.new_docs:
|
163 |
+
new_doc.delete()
|
164 |
+
for gen_doc in self.gen_docs:
|
165 |
+
gen_doc.delete()
|
166 |
+
self.new_docs = []
|
167 |
+
self.gen_docs = []
|
168 |
+
|
169 |
+
|
170 |
+
def get_log(self):
|
171 |
+
msg_log = create_msg_from(self.log, self.new_docs)
|
172 |
+
return msg_log
|
173 |
+
|
174 |
+
"""
|
175 |
+
Source Control
|
176 |
+
"""
|
177 |
+
|
178 |
+
def get_or_create_collection(self, id_: str) -> str:
|
179 |
+
"""
|
180 |
+
generates a new id if needed
|
181 |
+
TODO: rename into get_or_create_generation_collection
|
182 |
+
TODO: have a single DB with separate collections, one for requirements, one for generation
|
183 |
+
"""
|
184 |
+
if id_ != '-1':
|
185 |
+
return id_
|
186 |
+
else:
|
187 |
+
now = datetime.datetime.now().strftime("%m%d%H%M")
|
188 |
+
letters = string.ascii_lowercase + string.digits
|
189 |
+
id_ = now + '-' + ''.join(random.choice(letters) for _ in range(10))
|
190 |
+
semantic_db.get_or_create_collection(id_)
|
191 |
+
return id_
|
192 |
+
|
193 |
+
async def wiki_fetch(self) -> [str]:
|
194 |
+
"""
|
195 |
+
returns the title of the wikipages corresponding to the tasks described in the input text
|
196 |
+
"""
|
197 |
+
all_tasks = []
|
198 |
+
for new_doc in self.new_docs:
|
199 |
+
all_tasks += new_doc.tasks
|
200 |
+
async_tasks = [asyncio.create_task(get_wikilist(task)) for task in all_tasks]
|
201 |
+
wiki_lists = await asyncio.gather(*async_tasks)
|
202 |
+
flatten_wiki_list = list(set().union(*[set(w) for w in wiki_lists]))
|
203 |
+
return flatten_wiki_list
|
204 |
+
|
205 |
+
async def wiki_upload_and_store(self, wiki_title: str, collection_name: str):
|
206 |
+
"""
|
207 |
+
uploads one wikipage and stores them into the right collection
|
208 |
+
"""
|
209 |
+
wikipage = Wiki().fetch(wiki_title)
|
210 |
+
wiki_title = wiki_title
|
211 |
+
if type(wikipage) != str:
|
212 |
+
texts = WikiPage(wikipage.page_content).get_paragraphs()
|
213 |
+
add_texts_to_collection(coll_name=collection_name, texts=texts, file=wiki_title, source='wiki')
|
214 |
+
else:
|
215 |
+
print(wikipage)
|
216 |
+
|
217 |
+
"""
|
218 |
+
Generate Control
|
219 |
+
"""
|
220 |
+
|
221 |
+
|
222 |
+
async def generate_doc_from_db(self, collection_name: str, from_files: [str]) -> [str]:
|
223 |
+
|
224 |
+
def query_from_task(task):
|
225 |
+
return get_public_paragraph(task)
|
226 |
+
|
227 |
+
async def retrieve_text_and_generate(t, collection_name: str, from_files: [str]):
|
228 |
+
"""
|
229 |
+
retreives the texts from the database and generates the documents
|
230 |
+
"""
|
231 |
+
# retreive the texts from the database
|
232 |
+
task_query = query_from_task(t)
|
233 |
+
texts = query_collection(coll_name=collection_name, query=task_query, from_files=from_files)
|
234 |
+
task_resolutions = get_private_paragraph(task=t, texts=texts)
|
235 |
+
return task_resolutions
|
236 |
+
|
237 |
+
async def real_doc_generation(new_doc):
|
238 |
+
async_task_resolutions = [asyncio.create_task(retrieve_text_and_generate(t=task, collection_name=collection_name, from_files=from_files))
|
239 |
+
for task in new_doc.tasks]
|
240 |
+
tasks_resolutions = await asyncio.gather(*async_task_resolutions) #A VOIR
|
241 |
+
gen_path = f"{self.generated_docs_path}/{new_doc.name}e.docx"
|
242 |
+
gen_doc = new_doc.copy(gen_path)
|
243 |
+
gen_doc.replace_tasks(tasks_resolutions)
|
244 |
+
gen_doc.save_as_docx()
|
245 |
+
gen_paths.append(gen_doc.path)
|
246 |
+
self.gen_docs.append(gen_doc)
|
247 |
+
return gen_paths
|
248 |
+
|
249 |
+
gen_paths = []
|
250 |
+
gen_paths = await asyncio.gather(*[asyncio.create_task(real_doc_generation(new_doc)) for new_doc in self.new_docs])
|
251 |
+
gen_paths = [path for sublist in gen_paths for path in sublist]
|
252 |
+
gen_paths = list(set(gen_paths))
|
253 |
+
return gen_paths
|
254 |
+
|
255 |
+
|
256 |
+
|
257 |
+
|
258 |
+
|
259 |
+
|
260 |
+
"""
|
261 |
+
Requirements
|
262 |
+
"""
|
263 |
+
|
264 |
+
def set_input_csv(self, csv_path: str):
|
265 |
+
"""
|
266 |
+
TODO: rename to set_requirements_file
|
267 |
+
"""
|
268 |
+
self.input_csv = csv_path
|
269 |
+
|
270 |
+
def create_collection(self, docs: [Doc]):
|
271 |
+
"""
|
272 |
+
TODO: rename to create_requirements_collection
|
273 |
+
TODO: merge with semantic tool to have only one DB Object
|
274 |
+
"""
|
275 |
+
coll_name = "collection_for_docs"
|
276 |
+
collection = self.client_db.get_or_create_collection(coll_name)
|
277 |
+
for doc in docs:
|
278 |
+
self.fill_collection(doc, collection)
|
279 |
+
self.retriever.collection = collection
|
280 |
+
|
281 |
+
def fill_collection(self, doc: Doc, collection: str):
|
282 |
+
"""
|
283 |
+
fills the collection with the blocks of the documents
|
284 |
+
"""
|
285 |
+
Retriever(doc=doc, collection=collection)
|
src/domain/block.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
class Block:
|
4 |
+
def __init__(self, doc: str = '', title: str = '', content: str = '', content_fr: str = '',
|
5 |
+
index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
|
6 |
+
self.doc = doc
|
7 |
+
self.title = title
|
8 |
+
self.title_fr = ""
|
9 |
+
self.content = content
|
10 |
+
self.content_fr = content_fr
|
11 |
+
self.specials = []
|
12 |
+
self.index = index
|
13 |
+
self.rank = rank
|
14 |
+
self.level = level
|
15 |
+
self.distance = distance
|
16 |
+
|
17 |
+
|
18 |
+
def separate_1_block_in_n(self, max_size=4500):
|
19 |
+
"""
|
20 |
+
Separate a block in n blocks of equal size
|
21 |
+
"""
|
22 |
+
content_length = len(self.content)
|
23 |
+
n = math.ceil(content_length / max_size)
|
24 |
+
block_size = content_length // n
|
25 |
+
new_blocks = []
|
26 |
+
for i in range(n):
|
27 |
+
start = i * block_size
|
28 |
+
end = (i + 1) * block_size if i < n - 1 else None
|
29 |
+
new_blocks.append(Block(doc=self.doc,
|
30 |
+
title=self.title + f"_part{i}",
|
31 |
+
content=self.content[start:end],
|
32 |
+
index=self.index + f"_{i}",
|
33 |
+
rank=self.rank,
|
34 |
+
level=self.level))
|
35 |
+
return new_blocks
|
36 |
+
|
37 |
+
def to_dict(self) -> {}:
|
38 |
+
block_dict = {'doc': self.doc,
|
39 |
+
'title': self.title,
|
40 |
+
'title_fr': self.title_fr,
|
41 |
+
'content': self.content,
|
42 |
+
'content_fr': self.content_fr,
|
43 |
+
'index': self.index,
|
44 |
+
'rank': self.rank,
|
45 |
+
'level': self.level,
|
46 |
+
'distance': self.distance}
|
47 |
+
for i, s in enumerate(self.specials):
|
48 |
+
special_key = 'special_'+str(i)
|
49 |
+
block_dict[special_key] = s
|
50 |
+
block_dict['specials_len'] = len(self.specials)
|
51 |
+
return block_dict
|
52 |
+
|
53 |
+
def from_dict(self, block_dict: {}):
|
54 |
+
self.doc = block_dict['doc']
|
55 |
+
self.title = block_dict['title']
|
56 |
+
self.title_fr = block_dict['title_fr']
|
57 |
+
self.content = block_dict['content']
|
58 |
+
self.content_fr = block_dict['content_fr']
|
59 |
+
self.index = block_dict['index']
|
60 |
+
self.rank = block_dict['rank']
|
61 |
+
self.level = block_dict['level']
|
62 |
+
self.distance = block_dict['distance']
|
63 |
+
self.specials = []
|
64 |
+
for i in range(block_dict['specials_len']):
|
65 |
+
special_key = 'special_' + str(i)
|
66 |
+
self.specials.append(block_dict[special_key])
|
67 |
+
return self
|
68 |
+
|
69 |
+
@property
|
70 |
+
def distance_str(self) -> str:
|
71 |
+
return format(self.distance, '.2f')
|
src/domain/container.py
ADDED
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.domain.paragraph import Paragraph
|
2 |
+
from src.domain.block import Block
|
3 |
+
|
4 |
+
INFINITE = 10000
|
5 |
+
|
6 |
+
|
7 |
+
class Container:
|
8 |
+
|
9 |
+
def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None,
|
10 |
+
father=None, id_=0):
|
11 |
+
"""
|
12 |
+
should add some summary or infos on content (by a priori generation)
|
13 |
+
"""
|
14 |
+
if index is None:
|
15 |
+
index = []
|
16 |
+
self.level = level
|
17 |
+
if not self.level:
|
18 |
+
pass
|
19 |
+
self.title = title
|
20 |
+
self.paragraphs = []
|
21 |
+
self.all_paragraphs = paragraphs
|
22 |
+
self.children = []
|
23 |
+
self.index = index
|
24 |
+
self.father = father # if not father, then the container is at the top of the hierarchy
|
25 |
+
self.id_ = int(str(1) + str(father.id_) + str(id_))
|
26 |
+
if paragraphs:
|
27 |
+
self.paragraphs, self.children = self.create_children(paragraphs.copy(), level, index)
|
28 |
+
self.containers = [self]
|
29 |
+
for child in self.children:
|
30 |
+
self.containers += child.containers
|
31 |
+
self.blocks = self.get_blocks()
|
32 |
+
self.normal, self.comment, self.task, _ = self.sort_paragraphs()
|
33 |
+
|
34 |
+
self.one_liner = (self.title.text if self.title else '') + ' ' + self.comment
|
35 |
+
self.root_text = self.one_liner + ' ' + self.normal
|
36 |
+
|
37 |
+
|
38 |
+
@property
|
39 |
+
def text(self):
|
40 |
+
text = ""
|
41 |
+
if self.title:
|
42 |
+
text = "Titre " + str(self.level) + " : " + self.title.text + '\n'
|
43 |
+
for p in self.paragraphs:
|
44 |
+
text += p.text + '\n'
|
45 |
+
for child in self.children:
|
46 |
+
text += child.text
|
47 |
+
return text
|
48 |
+
|
49 |
+
@property
|
50 |
+
def table_of_contents(self):
|
51 |
+
"""
|
52 |
+
Not used
|
53 |
+
"""
|
54 |
+
toc = []
|
55 |
+
if self.title:
|
56 |
+
toc += [{str(self.level): self.title.text}]
|
57 |
+
if self.children:
|
58 |
+
for child in self.children:
|
59 |
+
toc += child.table_of_contents
|
60 |
+
return toc
|
61 |
+
|
62 |
+
def move(self, position: int, new_father=None):
|
63 |
+
"""
|
64 |
+
Not used
|
65 |
+
"""
|
66 |
+
current_father = self.father
|
67 |
+
current_father.children.remove(self)
|
68 |
+
|
69 |
+
self.rank = new_father.rank + 1 if new_father else 0
|
70 |
+
self.father = new_father
|
71 |
+
if position < len(new_father.children):
|
72 |
+
new_father.children.insert(position, self)
|
73 |
+
else:
|
74 |
+
new_father.children.append(self)
|
75 |
+
|
76 |
+
def create_children(self, paragraphs, level, rank) -> ([], []):
|
77 |
+
"""
|
78 |
+
creates children containers or directly attached content
|
79 |
+
and returns the list of containers and contents of level+1
|
80 |
+
:return:
|
81 |
+
[Content or Container]
|
82 |
+
"""
|
83 |
+
attached_paragraphs = []
|
84 |
+
container_paragraphs = []
|
85 |
+
container_title = None
|
86 |
+
children = []
|
87 |
+
in_children = False
|
88 |
+
level = INFINITE
|
89 |
+
child_id = 0
|
90 |
+
|
91 |
+
while paragraphs:
|
92 |
+
p = paragraphs.pop(0)
|
93 |
+
if not in_children and not p.is_structure:
|
94 |
+
attached_paragraphs.append(p)
|
95 |
+
else:
|
96 |
+
in_children = True
|
97 |
+
if p.is_structure and p.level <= level: # if p is higher or equal in hierarchy
|
98 |
+
if container_paragraphs or container_title:
|
99 |
+
children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
|
100 |
+
child_id += 1
|
101 |
+
container_paragraphs = []
|
102 |
+
container_title = p
|
103 |
+
level = p.level
|
104 |
+
|
105 |
+
else: # p is strictly lower in hierarchy
|
106 |
+
container_paragraphs.append(p)
|
107 |
+
|
108 |
+
if container_paragraphs or container_title:
|
109 |
+
children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
|
110 |
+
child_id += 1
|
111 |
+
|
112 |
+
return attached_paragraphs, children
|
113 |
+
|
114 |
+
@property
|
115 |
+
def structure(self):
|
116 |
+
|
117 |
+
self_structure = {str(self.id_): {
|
118 |
+
'index': str(self.id_),
|
119 |
+
'canMove': True,
|
120 |
+
'isFolder': True,
|
121 |
+
'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
|
122 |
+
'canRename': True,
|
123 |
+
'data': {},
|
124 |
+
'level': self.level,
|
125 |
+
'title': self.title.text if self.title else 'root'
|
126 |
+
}}
|
127 |
+
paragraphs_structure = [p.structure for p in self.paragraphs]
|
128 |
+
structure = [self_structure] + paragraphs_structure
|
129 |
+
for child in self.children:
|
130 |
+
structure += child.structure
|
131 |
+
return structure
|
132 |
+
|
133 |
+
def get_lang(self):
|
134 |
+
"""
|
135 |
+
returns the main language of the document
|
136 |
+
:return:
|
137 |
+
"""
|
138 |
+
|
139 |
+
def get_structure(self, level=2):
|
140 |
+
"""
|
141 |
+
returns the structure of the document
|
142 |
+
:return:
|
143 |
+
"""
|
144 |
+
|
145 |
+
def create_embeddings(self):
|
146 |
+
"""
|
147 |
+
|
148 |
+
:return:
|
149 |
+
"""
|
150 |
+
|
151 |
+
def get_blocks(self):
|
152 |
+
block = Block(level=self.level, index=self.index)
|
153 |
+
if self.title:
|
154 |
+
block.title = self.title.text
|
155 |
+
for p in self.paragraphs:
|
156 |
+
if not p.blank:
|
157 |
+
if p.text.startswith('##### '):
|
158 |
+
special_action = p.text.lstrip('##### ')
|
159 |
+
block.specials.append(special_action)
|
160 |
+
else:
|
161 |
+
block.content += p.text
|
162 |
+
blocks = [block] if block.content or block.specials else []
|
163 |
+
for child in self.children:
|
164 |
+
blocks += child.blocks
|
165 |
+
return blocks
|
166 |
+
|
167 |
+
def get_fulltask(self, doc_one_liner):
|
168 |
+
index = 0
|
169 |
+
siblings_ = []
|
170 |
+
if isinstance(self.father, Container):
|
171 |
+
siblings_ = self.father.children.copy()
|
172 |
+
index = siblings_.index(self)
|
173 |
+
siblings_before_context = [sibling.one_liner for idx, sibling in enumerate(siblings_) if idx < index]
|
174 |
+
siblings_after_context = [sibling.one_liner for idx, sibling in enumerate(siblings_) if index < idx]
|
175 |
+
|
176 |
+
fulltask = {'description': self.task,
|
177 |
+
'about': self.one_liner,
|
178 |
+
'doc_description': doc_one_liner,
|
179 |
+
'above': self.father.one_liner if isinstance(self.father, Container) else '',
|
180 |
+
'before': siblings_before_context,
|
181 |
+
'after': siblings_after_context}
|
182 |
+
return fulltask
|
183 |
+
|
184 |
+
def sort_paragraphs(self) -> (str, str, str, str):
|
185 |
+
mapping = {'normal': '', 'comment': '', 'task': '', 'title': ''}
|
186 |
+
for p in self.paragraphs:
|
187 |
+
mapping[p.type] += ' ' + p.parsed_text
|
188 |
+
return mapping['normal'], mapping['comment'], mapping['task'], mapping['title']
|
189 |
+
|
190 |
+
def get_all_styles_used_in_doc_except_list(self):
|
191 |
+
"""
|
192 |
+
loop in doc? rather thann in container? (since it applies only to container of level 0)
|
193 |
+
"""
|
194 |
+
styles = []
|
195 |
+
for p in self.all_paragraphs:
|
196 |
+
styles.append(p.get_styles_in_paragraph_except_list())
|
197 |
+
res = []
|
198 |
+
#flatten the list
|
199 |
+
temp = [item for sublist in styles for item in sublist]
|
200 |
+
names = [style.name for style in temp]
|
201 |
+
for s in temp:
|
202 |
+
if s.name in names:
|
203 |
+
res.append(s)
|
204 |
+
names.remove(s.name)
|
205 |
+
return res
|
206 |
+
|
207 |
+
def get_list_styles(self):
|
208 |
+
styles = []
|
209 |
+
for p in self.all_paragraphs:
|
210 |
+
styles.append(p.get_list_styles())
|
211 |
+
res = list(set().union(*styles))
|
212 |
+
return res
|
213 |
+
|
214 |
+
def retrieve_number_of_misapplied_styles(self):
|
215 |
+
res = 0
|
216 |
+
for p in self.all_paragraphs:
|
217 |
+
if p.style_misapplied:
|
218 |
+
res += 1
|
219 |
+
return res
|
src/domain/container_requirements.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.domain.paragraph import Paragraph
|
2 |
+
from src.domain.block import Block
|
3 |
+
|
4 |
+
INFINITE = 10000
|
5 |
+
|
6 |
+
|
7 |
+
class Container_requirements:
|
8 |
+
|
9 |
+
def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None,
|
10 |
+
father=None, id_=0):
|
11 |
+
if index is None:
|
12 |
+
index = []
|
13 |
+
self.level = level
|
14 |
+
if not self.level:
|
15 |
+
pass
|
16 |
+
self.title = title
|
17 |
+
self.paragraphs = []
|
18 |
+
self.all_paragraphs = paragraphs
|
19 |
+
self.children = []
|
20 |
+
self.index = index
|
21 |
+
self.father = father # if not father, then the container is at the top of the hierarchy
|
22 |
+
self.id_ = int(str(1) + str(father.id_) + str(id_))
|
23 |
+
if paragraphs:
|
24 |
+
self.paragraphs, self.children = self.create_children(paragraphs.copy(), level, index)
|
25 |
+
self.containers = [self]
|
26 |
+
for child in self.children:
|
27 |
+
self.containers += child.containers
|
28 |
+
self.blocks = self.get_blocks_requirements()
|
29 |
+
|
30 |
+
|
31 |
+
@property
|
32 |
+
def text(self):
|
33 |
+
text = ""
|
34 |
+
if self.title:
|
35 |
+
text = "Titre " + str(self.level) + " : " + self.title.text + '\n'
|
36 |
+
for p in self.paragraphs:
|
37 |
+
text += p.text + '\n'
|
38 |
+
for child in self.children:
|
39 |
+
text += child.text
|
40 |
+
return text
|
41 |
+
|
42 |
+
|
43 |
+
def move(self, position: int, new_father=None):
|
44 |
+
current_father = self.father # should be added in the domain
|
45 |
+
current_father.children.remove(self)
|
46 |
+
|
47 |
+
self.rank = new_father.rank + 1 if new_father else 0
|
48 |
+
self.father = new_father
|
49 |
+
if position < len(new_father.children):
|
50 |
+
new_father.children.insert(position, self)
|
51 |
+
else:
|
52 |
+
new_father.children.append(self)
|
53 |
+
|
54 |
+
def create_children(self, paragraphs, level, rank) -> ([], []):
|
55 |
+
"""
|
56 |
+
creates children containers or directly attached content
|
57 |
+
and returns the list of containers and contents of level+1
|
58 |
+
:return:
|
59 |
+
[Content or Container]
|
60 |
+
"""
|
61 |
+
attached_paragraphs = []
|
62 |
+
container_paragraphs = []
|
63 |
+
container_title = None
|
64 |
+
children = []
|
65 |
+
in_children = False
|
66 |
+
level = INFINITE
|
67 |
+
child_id = 0
|
68 |
+
|
69 |
+
while paragraphs:
|
70 |
+
p = paragraphs.pop(0)
|
71 |
+
if not in_children and not p.is_structure:
|
72 |
+
attached_paragraphs.append(p)
|
73 |
+
else:
|
74 |
+
in_children = True
|
75 |
+
if p.is_structure and p.level <= level: # if p is higher or equal in hierarchy
|
76 |
+
if container_paragraphs or container_title:
|
77 |
+
children.append(Container_requirements(container_paragraphs, container_title, level, rank, self, child_id))
|
78 |
+
child_id += 1
|
79 |
+
container_paragraphs = []
|
80 |
+
container_title = p
|
81 |
+
level = p.level
|
82 |
+
|
83 |
+
else: # p is strictly lower in hierarchy
|
84 |
+
container_paragraphs.append(p)
|
85 |
+
|
86 |
+
if container_paragraphs or container_title:
|
87 |
+
children.append(Container_requirements(container_paragraphs, container_title, level, rank, self, child_id))
|
88 |
+
child_id += 1
|
89 |
+
|
90 |
+
return attached_paragraphs, children
|
91 |
+
|
92 |
+
@property
|
93 |
+
def structure(self):
|
94 |
+
|
95 |
+
self_structure = {str(self.id_): {
|
96 |
+
'index': str(self.id_),
|
97 |
+
'canMove': True,
|
98 |
+
'isFolder': True,
|
99 |
+
'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
|
100 |
+
'canRename': True,
|
101 |
+
'data': {},
|
102 |
+
'level': self.level,
|
103 |
+
'title': self.title.text if self.title else 'root'
|
104 |
+
}}
|
105 |
+
paragraphs_structure = [p.structure for p in self.paragraphs]
|
106 |
+
structure = [self_structure] + paragraphs_structure
|
107 |
+
for child in self.children:
|
108 |
+
structure += child.structure
|
109 |
+
return structure
|
110 |
+
|
111 |
+
def get_blocks_requirements(self):
|
112 |
+
block = Block(level=self.level, index=self.index)
|
113 |
+
if self.title:
|
114 |
+
self.title.text = self.title.text.replace('\r', '').replace('\n', '')
|
115 |
+
block.title = self.title.text
|
116 |
+
block.content = self.title.text + '/'
|
117 |
+
temp_father = self.father
|
118 |
+
while temp_father and type(temp_father) == Container_requirements:
|
119 |
+
if temp_father.title:
|
120 |
+
temp_father.title.text = temp_father.title.text.replace('\r', '').replace('\n', '')
|
121 |
+
block.content = temp_father.title.text + '/' + block.content
|
122 |
+
temp_father = temp_father.father
|
123 |
+
block.content += " :\n\n"
|
124 |
+
i = 0
|
125 |
+
for p in self.paragraphs:
|
126 |
+
if not p.blank:
|
127 |
+
i = 1
|
128 |
+
if p.text.startswith('##### '):
|
129 |
+
special_action = p.text.lstrip('##### ')
|
130 |
+
block.specials.append(special_action)
|
131 |
+
else:
|
132 |
+
block.content += p.text
|
133 |
+
if i == 0:
|
134 |
+
blocks = []
|
135 |
+
else:
|
136 |
+
blocks = [block]
|
137 |
+
for child in self.children:
|
138 |
+
blocks += child.blocks
|
139 |
+
return blocks
|
140 |
+
|
src/domain/doc.py
ADDED
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import docx
|
2 |
+
from src.tools.doc_tools import *
|
3 |
+
from docxcompose.composer import Composer
|
4 |
+
from docx import Document as Document_compose
|
5 |
+
from docx.enum.table import WD_TABLE_ALIGNMENT
|
6 |
+
from src.domain.container import Container
|
7 |
+
from src.domain.container_requirements import Container_requirements
|
8 |
+
from src.domain.paragraph import Paragraph
|
9 |
+
from src.domain.styles import Styles
|
10 |
+
import shutil
|
11 |
+
import os
|
12 |
+
from docx.oxml.ns import qn
|
13 |
+
from docx.oxml.shared import OxmlElement
|
14 |
+
from docx.shared import Inches
|
15 |
+
from src.tools.pretty_print import pretty_print_block_and_indexes, pretty_print_paragraphs
|
16 |
+
from src.tools.index_creation import set_indexes
|
17 |
+
from src.reader.reader_for_requirements import WordReader
|
18 |
+
|
19 |
+
class Doc:
    """Domain wrapper around a python-docx document.

    Loads the .docx at *path*, wraps its paragraphs and styles into domain
    objects, builds the hierarchical containers and extracts the indexed
    "blocks" consumed by the retrieval layer.

    TODO: prefix internal methods with an underscore.
    """

    def __init__(self, path='', id_=None):
        # Underlying python-docx document object.
        self.xdoc = docx.Document(path)
        self.title = get_title(path)
        self.name = self.title.split('.')[0]
        # NOTE(review): the id_ parameter is accepted but ignored; the object
        # identity is used instead — confirm no caller relies on id_.
        self.id_ = id(self)
        self.path = path
        self.paragraphs = [Paragraph(xp, self.id_, i, self) for (i, xp) in enumerate(self.xdoc.paragraphs)]
        # Requirement paragraphs are only extracted for user documents, never templates.
        self.requirements_paragraphs = WordReader(self.path).paragraphs if "data/templates" not in self.path else []
        self.handle_content_before_toc()
        self.container = Container(self.paragraphs, father=self)
        self.container_requirements = Container_requirements(self.requirements_paragraphs, father=self)
        set_indexes(self.container, self.path)
        set_indexes(self.container_requirements, self.path)
        self.styles = Styles(self.xdoc.styles)
        self.tasks = [c.get_fulltask(self.container.one_liner) for c in self.container.containers if c.task]
        self.blocks = self.get_blocks()
        self.blocks_requirements = self.get_blocks_requirements()

    def copy(self, new_doc_path):
        """Copy the underlying file to *new_doc_path* and return a new Doc over it."""
        shutil.copyfile(self.path, new_doc_path)
        new_doc = Doc(new_doc_path)
        new_doc.save_as_docx(new_doc_path)
        return new_doc

    def clear(self):
        """Delete the underlying file from disk."""
        os.remove(self.path)

    def apply_template(self, template, options_list):
        """Apply *template* (styles, TOC, layout) to this document.

        options_list: user-selected option labels (justify text, center tables).
        Returns a log describing what was changed.
        TODO: move the user-facing strings to a config file.
        """
        log = []
        j = 0
        if "Justifier le texte (Normal)" in options_list:
            log.append("Le contenu du document a été justifié")
            self.justify_content()
            self.save_as_docx()
        if "Recentrer les tableaux" in options_list:
            j = self.center_tables()
            log.append(f"{j} table{'s' if j>1 else ''} centrée{'s' if j>1 else ''}")
            self.save_as_docx()
        log.append(f"Le template {template.name} a été ajouté avant le document")
        self.rearrange_tables()
        self.save_as_docx()
        log = self.styles.apply_from(template.styles, log)
        self.save_as_docx()
        self.delete_toc(template)
        self.normal_style_for_empty_paragraphs()
        self.save_as_docx()
        self.append_doc_to_template_and_update_toc(template)
        return log

    def copy_one_style(self, src_style_name: str, dest_style_name: str, template):
        """Map doc style *src_style_name* onto template style *dest_style_name*.

        Returns a log dict, or None when the source style does not exist.
        """
        style_dest = template.styles.get_style_from_name(dest_style_name)
        src_style = self.styles.get_style_from_name(src_style_name)
        if src_style:
            log = self.styles.copy_one_style(src_style, style_dest)
            return log
        else:
            return None

    def get_different_styles_with_template(self, template):
        """Return the styles used in this document that differ from the template's."""
        styles_used_in_doc = self.get_all_styles_used_in_doc_except_list()
        different_styles = get_difference_with_template(styles_used_in_doc, template)
        return different_styles

    def save_as_docx(self, path: str = ''):
        """Save the document; an empty *path* means "save in place"."""
        path = path if path else self.path
        self.path = path
        self.xdoc.save(path)

    @staticmethod
    def _index_to_str(index_list):
        """Join a hierarchical index such as [1, 2, 3] into the string '1.2.3'."""
        return '.'.join(str(el) for el in index_list)

    def get_blocks(self):
        """Return the document's blocks with stringified indexes.

        Returns None for templates and generated files, which have no
        retrievable blocks.
        TODO: add a predicate telling whether the Doc is a template or a
        generated doc, and merge with get_blocks_requirements.
        """
        if "temp/generated_files" in self.path or "data/templates" in self.path:
            return
        blocks = self.container.blocks
        for block in blocks:
            block.doc = self.title
            block.index = self._index_to_str(block.index)
        return blocks

    def get_blocks_requirements(self):
        """Same as get_blocks, but over the requirements container."""
        if "temp/generated_files" in self.path or "data/templates" in self.path:
            return
        blocks = self.container_requirements.blocks
        for block in blocks:
            block.doc = self.title
            # Some indexes are already strings; only convert the list form.
            if not isinstance(block.index, str):
                block.index = self._index_to_str(block.index)
        return blocks

    @property
    def toc(self):
        """Paragraphs that belong to the table of contents."""
        return [p for p in self.paragraphs if p.toc]

    @property
    def structure(self):
        """Tree-view structure delegated to the root container."""
        return self.container.structure

    def replace_tasks(self, resolutions: list):
        """Replace each 'task' paragraph with the matching resolution text.

        Silently skips (with a console message) when counts do not match.
        """
        if len(resolutions) == len(self.tasks):  # exception to be handled
            p_tasks = [p for p in self.paragraphs if p.type == 'task']
            for p, r in zip(p_tasks, resolutions):
                p.set_text(r)
        else:
            print(f"résolutions : {len(resolutions)} != {len(self.tasks)} tasks")
        return self

    def get_paragraphs(self):
        return self.container.all_paragraphs

    def get_text_from_paragraphs(self):
        return [p.text for p in self.paragraphs]

    def check_document(self):
        """Debugging helper: pretty-print the doc's pictures, text and tables."""
        picCount = 0
        tabCount = 0
        for paragraph in self.xdoc.paragraphs:
            if picCount < len(self.xdoc.inline_shapes):
                print('\033[1mPicture \033[0m')
                picCount += 1
            elif paragraph.text:
                print(paragraph.text)
            elif tabCount < len(self.xdoc.tables):
                table = self.xdoc.tables[tabCount]
                data = []
                keys = None
                for i, row in enumerate(table.rows):
                    text = (cell.text for cell in row.cells)
                    if i == 0:
                        # First row is used as the header keys.
                        keys = tuple(text)
                        continue
                    row_data = dict(zip(keys, text))
                    data.append(row_data)
                print('\033[1mTable:\033[0m', data)
                tabCount += 1
            else:
                print('\033[1mEmpty paragraph\033[0m')

    def center_tables(self):
        """Center every table; returns the number of tables processed."""
        j = 0
        for table in self.xdoc.tables:
            j += 1
            table.alignment = WD_TABLE_ALIGNMENT.CENTER
        return j

    def rearrange_tables(self):
        """Hotfix for table autofit, applied directly on the underlying XML."""
        for table in self.xdoc.tables:
            table.autofit = True
            table.allow_autofit = True
            table._tblPr.xpath("./w:tblW")[0].attrib[
                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type"] = "auto"
            for row in table.rows:
                for cell in row.cells:
                    cell._tc.tcPr.tcW.type = 'auto'
                    cell._tc.tcPr.tcW.w = 0

    def center_images(self):
        """Center paragraphs containing images (only works for images in runs)."""
        for paragraph in self.paragraphs:
            paragraph.center_paragraph()

    def justify_content(self):
        """Justify the text; applied only to 'Normal'-styled paragraphs."""
        for paragraph in self.paragraphs:
            paragraph.justify_paragraph()

    def number_images_in_doc(self):
        """Debug helper (unused): count the inline pictures of the document."""
        picCount = 0
        for _ in self.xdoc.paragraphs:
            if picCount < len(self.xdoc.inline_shapes):
                print('\033[1mPicture \033[0m')
                picCount += 1
        return picCount

    def get_all_styles_used_in_doc_except_list(self):
        return self.container.get_all_styles_used_in_doc_except_list()

    def get_list_styles(self):
        return self.container.get_list_styles()

    def retrieve_number_of_misapplied_styles(self):
        return self.container.retrieve_number_of_misapplied_styles()

    def normal_style_for_empty_paragraphs(self):
        """Reset blank, non-TOC paragraphs to the 'Normal' style, then save."""
        for p in self.paragraphs:
            if p.blank and not p.toc:
                p.set_style(self.styles.get_style_from_name("Normal"))
        self.save_as_docx()

    def append_doc_to_template_and_update_toc(self, template):
        """Prepend the template to this document and refresh the TOC.

        Document_compose is the plain Document class from python-docx;
        Composer (docxcompose) allows merging several documents.
        TODO: rename Document_compose into XDocument.
        """
        master = Document_compose(template.path)
        composer = Composer(master)
        doc = Document_compose(self.path)
        composer.append(doc)
        composer.save(self.path)
        new_doc = Doc(self.path)
        update_table_of_contents(new_doc.xdoc)
        new_doc.save_as_docx()

    def delete_content_before_toc(self):
        """Remove everything that appears before the table of contents.

        TODO: loop with our Paragraph objects instead of raw xdoc paragraphs.
        NOTE(review): paragraphs are deleted while iterating xdoc.paragraphs
        and self.paragraphs is popped from the front in lockstep — this relies
        on the pre-TOC paragraphs being exactly the first ones; confirm.
        """
        if self.contains_toc():
            for line in self.xdoc.paragraphs:
                if "toc" in line.style.name:
                    break
                if len(line.text) == 0:
                    self.delete_paragraph(line)
                    self.paragraphs.pop(0)
                    continue
                if 'toc' not in line.style.name:
                    self.delete_paragraph(line)
                    self.paragraphs.pop(0)
        self.save_as_docx()

    def delete_paragraph(self, paragraph):
        """Detach *paragraph* (a raw docx paragraph) from the XML tree.

        TODO: move to the Paragraph class.
        """
        p = paragraph._element
        p.getparent().remove(p)
        paragraph._p = paragraph._element = None

    def delete_toc(self, template):
        """Delete this document's table of contents.

        TODO: loop with our Paragraph objects instead of raw xdoc paragraphs.
        """
        index_to_insert = None
        for index, p in enumerate(template.paragraphs):
            index_to_insert = index
            # BUGFIX: the original condition `("table des matières" or
            # "table of contents") in ...` only ever tested the French string.
            text = p.text.lower()
            if "table des matières" in text or "table of contents" in text:
                index_to_insert += 1
                break
        # NOTE(review): index_to_insert is computed but never used — confirm
        # whether it was meant to feed insert_table_of_content().
        xparagraphs_toc = [p.xparagraph for p in self.toc]
        for p in xparagraphs_toc:
            self.delete_paragraph(p)
            self.paragraphs.pop(0)
        self.save_as_docx()

    def insert_table_of_content(self, index):
        """Insert a Word TOC field before paragraph *index* (not used here)."""
        paragraph = self.xdoc.paragraphs[index].insert_paragraph_before("", "Normal")
        paragraph.paragraph_format.space_before = Inches(0)
        paragraph.paragraph_format.space_after = Inches(0)
        run = paragraph.add_run()

        fldChar = OxmlElement('w:fldChar')  # creates a new element
        fldChar.set(qn('w:fldCharType'), 'begin')  # sets attribute on element

        instrText = OxmlElement('w:instrText')
        instrText.set(qn('xml:space'), 'preserve')
        instrText.text = 'TOC \\o "1-5" \\h \\z \\u'  # change "1-5" depending on the heading levels you need

        fldChar2 = OxmlElement('w:fldChar')
        fldChar2.set(qn('w:fldCharType'), 'separate')

        fldChar3 = OxmlElement('w:t')
        fldChar3.text = "Right-click to update field."
        # NOTE(review): fldChar3 is immediately rebound below, so the w:t
        # element above is discarded — confirm whether it should be appended.
        fldChar3 = OxmlElement('w:updateFields')
        fldChar3.set(qn('w:val'), 'true')
        fldChar2.append(fldChar3)

        fldChar4 = OxmlElement('w:fldChar')
        fldChar4.set(qn('w:fldCharType'), 'end')

        r_element = run._r
        r_element.append(fldChar)
        r_element.append(instrText)
        r_element.append(fldChar2)
        r_element.append(fldChar4)

        p_element = paragraph._p
        print(p_element.xml)

    def contains_toc(self):
        """Return True when the body contains hyperlink-styled runs, i.e. a
        generated table of contents."""
        body_elements = self.xdoc._body._body
        # Extract the runs wrapped in a <w:r> tag.
        rs = body_elements.xpath('.//w:r')
        # A run styled as a hyperlink belongs to the TOC.
        table_of_content = []
        for r in rs:
            if r.style:
                if "hyperlink" in r.style.lower() or "lienhypertexte" in r.style.lower():
                    table_of_content.append(r.text)
        return len(table_of_content) > 0

    def handle_content_before_toc(self):
        """Strip pre-TOC content for user documents (not templates nor
        generated files).

        TODO: use a function to determine the type of the doc.
        NOTE: known problem area (original comment: "PREMIER PROBLEME").
        """
        if "data/templates" not in self.path and "temp/generated_files" not in self.path:
            self.delete_content_before_toc()

    def delete_style(self, style_name):
        """Remove *style_name* from the document's styles and save."""
        self.styles.delete_style(style_name)
        self.save_as_docx()

    def change_bullet_style(self, style_name, template_style_name, template) -> dict:
        """Rewrite every bullet paragraph of *style_name* with the template
        style *template_style_name*.

        style_name has the form "<real_style_name> : indentation = <level>":
        real_style_name is the core style name with no indentation, level is
        the indentation level.
        TODO: recode to respect the OOP.
        """
        i = 0
        real_style_name = style_name.split(' : ')[0]
        level = int(style_name.split(' = ')[1])
        while i < len(self.xdoc.paragraphs):
            para = self.xdoc.paragraphs[i]
            if real_style_name == para.style.name and self.paragraphs[i].is_list and self.paragraphs[i].list_indentation == level:
                self.delete_paragraph(self.xdoc.paragraphs[i])
                self.paragraphs.pop(i)
                if i == len(self.xdoc.paragraphs):
                    # Deleted paragraph was the last one: append at the end.
                    paragraph_inserted = self.xdoc.add_paragraph(para.text, style=template.styles.get_style_from_name(template_style_name))
                else:
                    paragraph_inserted = self.xdoc.paragraphs[i].insert_paragraph_before(para.text, style=template.styles.get_style_from_name(template_style_name))
                self.paragraphs.insert(i, Paragraph(paragraph_inserted, self.id_, i, self))
            i += 1
        log_dict = self.change_bullet_style_in_tables(style_name, template_style_name, template)
        self.save_as_docx()
        return log_dict

    def change_bullet_style_in_tables(self, style_name, template_style_name, template) -> dict:
        """Same as change_bullet_style, but inside table cells.

        TODO: same refactoring as above.
        """
        real_style_name = style_name.split(' : ')[0]
        level = int(style_name.split(' = ')[1])
        for table in self.xdoc.tables:
            for row in table.rows:
                for cell in row.cells:
                    i = 0
                    for para in cell.paragraphs:
                        real_para = Paragraph(para, self.id_, i, self)
                        if real_style_name == para.style.name and real_para.is_list and real_para.list_indentation == level:
                            self.delete_paragraph(para)
                            if i == len(cell.paragraphs):
                                cell.add_paragraph(real_para.text, style=template.styles.get_style_from_name(template_style_name))
                            else:
                                cell.paragraphs[i].insert_paragraph_before(real_para.text, style=template.styles.get_style_from_name(template_style_name))
                        i += 1
        log = f"Le style {style_name} a été changé en {template_style_name}"
        log_dict = {'list_mapping': log}
        return log_dict

    def table_insertion(self, paragraph: Paragraph, content: dict):
        """Insert a table right after *paragraph* and return it.

        content has the format:
            {"headers": ["h1", "h2", "h3"],
             "rows": [["r1", "r1", "r1"], ["r2", "r2", "r2"]]}
        """
        self.xdoc.add_table(rows=len(content["rows"]) + 1, cols=len(content["headers"]))
        # 'Normal table' default style.
        table = self.xdoc.tables[-1]
        # Fill the header row.
        for i, header in enumerate(content["headers"]):
            table.cell(0, i).text = header
        # Fill the data rows.
        for i, row in enumerate(content["rows"]):
            for j, cell in enumerate(row):
                table.cell(i + 1, j).text = cell
        # Move the table right after the anchor paragraph.
        self.move_table_after(table, paragraph.xparagraph)
        self.rearrange_tables()
        self.save_as_docx()
        return table

    def delete_table(self, table):
        """Detach *table* from the XML tree and save."""
        table._element.getparent().remove(table._element)
        table._element = table._row = None
        self.save_as_docx()

    def move_table_after(self, table, paragraph):
        """Move *table* so it immediately follows *paragraph* in the XML tree."""
        tbl, p = table._tbl, paragraph._p
        p.addnext(tbl)

    def remove_all_but_last_section(self):
        """Remove every inline w:sectPr of the document (not used)."""
        sectPrs = self.xdoc._element.xpath(".//w:pPr/w:sectPr")
        for sectPr in sectPrs:
            print(sectPr)
            sectPr.getparent().remove(sectPr)
|
src/domain/paragraph.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
3 |
+
from src.tools.paragraph_tools import find_list_indentation_level
|
4 |
+
|
5 |
+
|
6 |
+
# Sentinel level for paragraphs that are not headings.
INFINITE = 10000

class Paragraph:
    """Domain wrapper around a python-docx paragraph.

    Carries the paragraph's style, heading level, list information and a
    parsed classification ('structure' / 'task' / 'comment' / 'normal').
    """

    def __init__(self, xparagraph, doc_id: int, id_: int, doc):
        self.doc = doc
        self.xparagraph = xparagraph
        # Template paragraphs never carry list information.
        self.is_template_para = "data/templates" in self.doc.path
        # Composite id: literal '2' prefix + doc id + paragraph id.
        self.id_ = int(str(2) + str(doc_id) + str(id_))
        self.style_name = self.xparagraph.style.name
        self.is_list, self.list_indentation = find_list_indentation_level(self.xparagraph, self.doc) if not self.is_template_para else (False, 0)
        self.level = self.get_level_from_name(self.style_name)
        self.is_structure = self.level < INFINITE
        self.text = self.xparagraph.text
        self.type, self.parsed_text = self.parse_text()

    @property
    def style_misapplied(self):
        """Whether the first run's font deviates from its declared style.

        NOTE: known to be bugged; currently unused.
        """
        # Compare the actual first-run font against the declared style's font;
        # a mismatch means the style was manually overridden.
        first_run_style = [run.style.font for run in self.xparagraph.runs]
        first_run_style = first_run_style[0] if first_run_style else None
        if not first_run_style:
            return False
        doc_style = self.doc.styles.get_style_from_name(self.style_name)
        if first_run_style.size != doc_style.font.size:
            return True
        if first_run_style.name != doc_style.font.name:
            return True
        if first_run_style.bold != doc_style.font.bold:
            return True
        if first_run_style.italic != doc_style.font.italic:
            return True
        if first_run_style.underline != doc_style.font.underline:
            return True
        if first_run_style.all_caps != doc_style.font.all_caps:
            return True
        if first_run_style.color.rgb != doc_style.font.color.rgb:
            return True
        return False

    @property
    def structure(self):
        """Tree-view node describing this paragraph."""
        structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': False,
            'children': [],
            'title': self.text,
            'canRename': True,
            'data': {},
            'level': self.level,
        }}
        return structure

    @property
    def blank(self):
        """True when the paragraph brings no signal (no ascii letter at all)."""
        text = self.text.replace('\n', '')
        return set(text).isdisjoint(string.ascii_letters)

    @property
    def toc(self):
        """True when the paragraph is part of the table of contents."""
        return "toc" in self.style_name

    @staticmethod
    def get_level_from_name(style_name: str) -> int:
        """Heading level from the style name ('Heading 2' -> 2).

        Returns INFINITE for non-heading styles or when no trailing digit
        is present (e.g. 'Heading Char').
        """
        level = INFINITE
        if 'Titre' in style_name or 'Heading' in style_name:
            suffix = style_name[-1]
            try:
                level = int(suffix)
            except ValueError:
                # No trailing digit: keep the sentinel level.
                pass
        return level

    def parse_text(self) -> tuple:
        """Classify the paragraph; returns (type, parsed_text).

        Headings are 'structure'; text containing a '?? ' marker is a 'task',
        '++ ' a 'comment'; everything else is 'normal'.
        """
        if self.is_structure:
            return 'structure', self.text
        markers = {"?? ": "task", "++ ": "comment"}
        for marker, paragraph_type in markers.items():
            split = self.text.rsplit(marker)
            if 1 < len(split):
                return paragraph_type, split[1]
        return "normal", self.text

    def set_text(self, text: str):
        """Replace both the cached text and the underlying docx text."""
        self.text = text
        self.xparagraph.text = text
        return self

    def center_paragraph(self):
        """Center the paragraph when it contains an inline image."""
        if self.contains_image():
            self.xparagraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

    def justify_paragraph(self):
        """Justify the paragraph, but only for the 'Normal' style."""
        if self.xparagraph.style.name == "Normal":
            self.xparagraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    def contains_image(self) -> bool:
        """True when any run of the paragraph embeds a picture element."""
        return any("pic:pic" in run.element.xml for run in self.xparagraph.runs)

    def get_styles_in_paragraph_except_list(self):
        """Paragraph style plus any run style overriding it (lists excluded)."""
        styles = [self.xparagraph.style] if not self.is_list else []
        for run in self.xparagraph.runs:
            if run.style.name != "Default Paragraph Font" and run.style.name != self.xparagraph.style.name:
                styles.append(run.style)
        return styles

    def get_list_styles(self):
        """For list paragraphs, the '<style> : indentation = <n>' descriptor."""
        styles = []
        if self.is_list:
            styles.append(self.xparagraph.style.name + " : indentation = " + str(self.list_indentation))
        return styles

    def set_style(self, style):
        """Apply *style* to the underlying docx paragraph."""
        self.xparagraph.style = style
        return self
|
138 |
+
|
139 |
+
|
140 |
+
|
src/domain/requirements_paragraphs.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
|
3 |
+
# Sentinel level for paragraphs that are not headings.
INFINITE = 10000

class Requirement_Paragraph:
    """A paragraph extracted by the requirements reader (text + font style)."""

    def __init__(self, text: str, font_style: str, id_: int, page_id: int):
        self.font_style = font_style
        # Composite id: literal '2' prefix + page id + paragraph id.
        self.id_ = int(str(2) + str(page_id) + str(id_))
        self.page_id = page_id
        self.level = self.get_level_from_name(font_style)
        self.is_structure = self.level < INFINITE
        self.text = text

    @property
    def blank(self):
        """True when the paragraph brings no signal (no ascii letter at all)."""
        text = self.text.replace('\n', '')
        return set(text).isdisjoint(string.ascii_letters)

    def rearrange_paragraph(self):
        """Wrap code/table paragraphs with explicit markers for later prompting."""
        if self.font_style == "code":
            self.text = "\n\nCode :```\n" + self.text + "\n```\n\n"
        elif self.font_style == "table":
            self.text = "\n\nTable :\n" + self.text + "\n\n"
        return self

    @staticmethod
    def get_level_from_name(style_name: str) -> int:
        """Heading level from the style name ('Titre 2' -> 2).

        Returns INFINITE for non-heading styles or when no trailing digit
        is present.
        """
        level = INFINITE
        if 'Titre' in style_name or 'Heading' in style_name:
            suffix = style_name[-1]
            try:
                level = int(suffix)
            except ValueError:
                # No trailing digit: keep the sentinel level.
                pass
        return level
|
src/domain/styles.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from docx.enum.style import WD_STYLE_TYPE
|
2 |
+
from docx.shared import RGBColor
|
3 |
+
import re
|
4 |
+
|
5 |
+
|
6 |
+
class Styles:
|
7 |
+
|
8 |
+
def __init__(self, xstyles, doc_id=0, id_=0):
|
9 |
+
|
10 |
+
self.id_ = int(str(doc_id)+str(id_))
|
11 |
+
self.xstyles = xstyles
|
12 |
+
self.names = [s.name for s in xstyles]
|
13 |
+
@staticmethod
|
14 |
+
def copy_style(src=None, dest=None) -> {}:
|
15 |
+
modified_style = set()
|
16 |
+
if src.type == WD_STYLE_TYPE.PARAGRAPH:
|
17 |
+
same_color = True
|
18 |
+
if src.font.color.rgb:
|
19 |
+
dest_rgb = RGBColor(src.font.color.rgb[0], src.font.color.rgb[1], src.font.color.rgb[2])
|
20 |
+
if dest.font.color.rgb:
|
21 |
+
for i in range(3):
|
22 |
+
same_color *= dest.font.color.rgb[i] == dest_rgb[i]
|
23 |
+
else:
|
24 |
+
same_color = False
|
25 |
+
dest.font.color.rgb = dest_rgb
|
26 |
+
else:
|
27 |
+
if dest.font.color.rgb:
|
28 |
+
same_color = False
|
29 |
+
if not same_color:
|
30 |
+
modified_style.add(('color', True))
|
31 |
+
|
32 |
+
if dest.font.size != src.font.size:
|
33 |
+
dest.font.size = src.font.size
|
34 |
+
modified_style.add(('font size', (src.font.size, dest.font.size)))
|
35 |
+
|
36 |
+
if dest.font.name != src.font.name:
|
37 |
+
dest.font.name = src.font.name
|
38 |
+
modified_style.add(('font', (src.font.name, dest.font.name)))
|
39 |
+
|
40 |
+
if dest.font.all_caps != src.font.all_caps:
|
41 |
+
dest.font.all_caps = src.font.all_caps
|
42 |
+
modified_style.add(('all_caps', (src.font.all_caps, dest.font.all_caps)))
|
43 |
+
|
44 |
+
if dest.font.bold != src.font.bold:
|
45 |
+
dest.font.bold = src.font.bold
|
46 |
+
modified_style.add(('bold', (src.font.bold, dest.font.bold)))
|
47 |
+
|
48 |
+
dest.font.complex_script = src.font.complex_script
|
49 |
+
dest.font.cs_bold = src.font.cs_bold
|
50 |
+
dest.font.cs_italic = src.font.cs_italic
|
51 |
+
dest.font.double_strike = src.font.double_strike
|
52 |
+
dest.font.emboss = src.font.emboss
|
53 |
+
dest.font.hidden = src.font.hidden
|
54 |
+
dest.font.highlight_color = src.font.highlight_color
|
55 |
+
dest.font.imprint = src.font.imprint
|
56 |
+
dest.font.italic = src.font.italic
|
57 |
+
dest.font.math = src.font.math
|
58 |
+
dest.font.no_proof = src.font.no_proof
|
59 |
+
dest.font.outline = src.font.outline
|
60 |
+
dest.font.rtl = src.font.rtl
|
61 |
+
dest.font.shadow = src.font.shadow
|
62 |
+
dest.font.small_caps = src.font.small_caps
|
63 |
+
dest.font.snap_to_grid = src.font.snap_to_grid
|
64 |
+
dest.font.spec_vanish = src.font.spec_vanish
|
65 |
+
dest.font.strike = src.font.strike
|
66 |
+
dest.font.subscript = src.font.subscript
|
67 |
+
dest.font.superscript = src.font.superscript
|
68 |
+
dest.font.underline = src.font.underline
|
69 |
+
dest.font.web_hidden = src.font.web_hidden
|
70 |
+
dest.base_style = src.base_style
|
71 |
+
dest.hidden = src.hidden
|
72 |
+
dest.locked = src.locked
|
73 |
+
dest.name = src.name
|
74 |
+
dest.priority = src.priority
|
75 |
+
dest.quick_style = src.quick_style
|
76 |
+
dest.unhide_when_used = src.unhide_when_used
|
77 |
+
|
78 |
+
if src.type == WD_STYLE_TYPE.LIST:
|
79 |
+
dest.hidden = src.hidden
|
80 |
+
dest.locked = src.locked
|
81 |
+
dest.name = src.name
|
82 |
+
dest.priority = src.priority
|
83 |
+
dest.quick_style = src.quick_style
|
84 |
+
dest.style_id = src.style_id
|
85 |
+
dest.unhide_when_used = src.unhide_when_used
|
86 |
+
|
87 |
+
if src.type == WD_STYLE_TYPE.TABLE:
|
88 |
+
dest.hidden = src.hidden
|
89 |
+
dest.locked = src.locked
|
90 |
+
dest.name = src.name
|
91 |
+
dest.priority = src.priority
|
92 |
+
dest.quick_style = src.quick_style
|
93 |
+
dest.unhide_when_used = src.unhide_when_used
|
94 |
+
return modified_style
|
95 |
+
|
96 |
+
|
97 |
+
def apply_from(self, template_styles, options_list):
|
98 |
+
|
99 |
+
if(options_list == []):
|
100 |
+
log = {'suppressed_styles': [], 'modified_styles': [], 'added_styles': []}
|
101 |
+
else:
|
102 |
+
log = {'options_applied': options_list,'suppressed_styles': [], 'modified_styles': [], 'added_styles': []}
|
103 |
+
|
104 |
+
for s in self.xstyles:
|
105 |
+
if s.name in template_styles.names:
|
106 |
+
src_style = template_styles.check_particular_styles(s.name)
|
107 |
+
log_s = self.copy_style(src=src_style, dest=s)
|
108 |
+
if log_s:
|
109 |
+
log['modified_styles'].append((s.name, log_s))
|
110 |
+
|
111 |
+
for s in template_styles.xstyles:
|
112 |
+
if not self.contains_style(s):
|
113 |
+
log['added_styles'].append(s.name)
|
114 |
+
self.xstyles.add_style(s.name, s.type)
|
115 |
+
self.copy_style(src=s, dest=self.xstyles[s.name])
|
116 |
+
return log
|
117 |
+
|
118 |
+
|
119 |
+
def copy_one_style(self, src_style, dest_style) -> {}:
|
120 |
+
log_msg = \
|
121 |
+
f"le style {src_style.name} a été mappé sur le style {dest_style.name} du template"
|
122 |
+
log_dict = {'style_mapping': log_msg}
|
123 |
+
self.copy_style(dest_style, src_style)
|
124 |
+
return log_dict
|
125 |
+
|
126 |
+
def get_style_from_name(self, name: str):
|
127 |
+
try:
|
128 |
+
s = self.xstyles[name]
|
129 |
+
except:
|
130 |
+
return None
|
131 |
+
return s
|
132 |
+
|
133 |
+
def contains_style(self, style):
|
134 |
+
resp = True
|
135 |
+
try:
|
136 |
+
s = self.xstyles[style.name]
|
137 |
+
except:
|
138 |
+
try:
|
139 |
+
s = self.xstyles[style.name[1:]]
|
140 |
+
except:
|
141 |
+
resp = False
|
142 |
+
return resp
|
143 |
+
|
144 |
+
def check_particular_styles(self,style_to_transform : str):
|
145 |
+
temp = style_to_transform
|
146 |
+
if re.search("^Heading [0-9]$", style_to_transform) or re.search("^Titre [0-9]$", style_to_transform):
|
147 |
+
style_to_transform = ".Titre" + style_to_transform[-1]
|
148 |
+
res = self.get_style_from_name(style_to_transform)
|
149 |
+
if res is None:
|
150 |
+
style_to_transform = ".Titre " + style_to_transform[-1]
|
151 |
+
res = self.get_style_from_name(style_to_transform)
|
152 |
+
else:
|
153 |
+
return res
|
154 |
+
if res:
|
155 |
+
return res
|
156 |
+
else:
|
157 |
+
return self.get_style_from_name(temp)
|
158 |
+
else:
|
159 |
+
return self.get_style_from_name(temp)
|
160 |
+
|
161 |
+
    def delete_style(self, style_name):
        """Remove *style_name* from the document and from the cached name list."""
        # Delete the underlying python-docx style first, then keep the
        # local `names` cache in sync with it.
        self.xstyles[style_name].delete()
        self.names.remove(style_name)
|
164 |
+
|
src/domain/wikidoc.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
the class works but lots of code could be reused
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
+
class Doc:
    """Hierarchical view over a plain-text document.

    The full text is split into non-blank Line objects, a title is
    extracted (for 'input_text' documents) and the remaining lines are
    organised into a Container tree.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: dict = None):
        # BUGFIX: the mutable default argument `params={}` was shared
        # between calls; use a None sentinel instead.
        self.params = params if params is not None else {}
        self.lines = [Line(text.strip(), self.params) for text in fulltext.split("\n") if text.strip()]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title, father=self, params=self.params)
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return (title, lines).

        For 'input_text' documents the title is taken from the first line
        when that line is typed as a title (and removed from the lines);
        otherwise a placeholder title is used.  Other document types keep
        the *title* argument untouched.
        """
        lines = self.lines
        if self.params['type'] == 'input_text':
            if self.lines and self.lines[0] and self.lines[0].type == 'title':
                title = self.lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines
|
23 |
+
|
24 |
+
|
25 |
+
class WikiPage(Doc):
    """Doc specialisation for MediaWiki markup ('== Title ==' headings)."""

    def __init__(self, fulltext='', title=''):
        self.params = {
            'type': 'wiki',
            # '== x ==' is a level-1 heading, up to '======= x =======' for level 6.
            'startswith_':
                {'== ': '1', '=== ': '2', '==== ': '3', '===== ': '4', '====== ': '5', '======= ': '6'},
            'endswith_':
                # BUGFIX: the last suffix was ' ======' (six '='), duplicated
                # from level 5 and inconsistent with the seven-'=' starter
                # '======= ' it is paired with.
                [' ==', ' ===', ' ====', ' =====', ' ======', ' ======='],

            # Wikipedia boilerplate sections dropped from the tree.
            'discarded': ["See also", "Notes", "References", "Sources", "External links", "Bibliography",
                          "Cinematic adaptations", "Further reading", "Maps"]
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the page's text split into paragraphs of roughly *chunk* chars."""
        return self.container.get_paragraphs(chunk)
|
42 |
+
|
43 |
+
|
44 |
+
class Container:
|
45 |
+
|
46 |
+
def __init__(self, lines=[], level=0, title='', father=None, params={}):
|
47 |
+
|
48 |
+
self.children = []
|
49 |
+
self.level = level
|
50 |
+
self.title = title
|
51 |
+
self.father = father
|
52 |
+
self.lines = []
|
53 |
+
self._expand(lines)
|
54 |
+
if params and 'discarded' in params.keys():
|
55 |
+
self.children = [child for child in self.children if child.title not in params['discarded']]
|
56 |
+
self.containers = [self]
|
57 |
+
for child in self.children:
|
58 |
+
self.containers += child.containers
|
59 |
+
self.text = ''
|
60 |
+
for child in self.children:
|
61 |
+
self.text += ' ' + child.text
|
62 |
+
|
63 |
+
def _expand(self, lines):
|
64 |
+
new_child = False
|
65 |
+
new_child_lines = []
|
66 |
+
new_child_title = []
|
67 |
+
for line in lines:
|
68 |
+
if not new_child:
|
69 |
+
if line.is_structure:
|
70 |
+
new_child = True
|
71 |
+
new_child_lines = []
|
72 |
+
new_child_title = line.text
|
73 |
+
line.level = self.level + 1
|
74 |
+
else:
|
75 |
+
self.lines.append(line)
|
76 |
+
|
77 |
+
else:
|
78 |
+
if self.level + 1 < line.level or not line.is_structure:
|
79 |
+
new_child_lines.append(line)
|
80 |
+
elif self.level + 1 == line.level:
|
81 |
+
self.children.append(Container(lines=new_child_lines,
|
82 |
+
level=self.level + 1,
|
83 |
+
title=new_child_title,
|
84 |
+
father=self))
|
85 |
+
new_child_lines = []
|
86 |
+
new_child_title = line.text
|
87 |
+
if new_child:
|
88 |
+
self.children.append(Container(lines=new_child_lines,
|
89 |
+
level=self.level + 1,
|
90 |
+
title=new_child_title,
|
91 |
+
father=self))
|
92 |
+
|
93 |
+
def get_paragraphs(self, chunk=500):
|
94 |
+
if len(self.text) < chunk:
|
95 |
+
paragraphs = [self.text]
|
96 |
+
else:
|
97 |
+
paragraphs = [self.root_text]
|
98 |
+
for child in self.children:
|
99 |
+
paragraphs += child.get_paragraphs(chunk)
|
100 |
+
return paragraphs
|
101 |
+
|
102 |
+
|
103 |
+
class Line:
    """One raw text line, classified via the markers given in `params`.

    `type` is the marker's tag (a digit string for headings, 'normal'
    otherwise), `level` the heading depth (-1 for plain text) and
    `is_structure` tells whether the line opens a new section.
    """

    def __init__(self, text, params):
        self.text = text
        self.params = params
        self.type, self.text = self._parse_text()
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = self.level > 0

    def _parse_text(self):
        """Return (type, cleaned_text) by matching the configured markers."""

        def strip_text(raw, start, end):
            # Keep what follows the start marker, then cut at the end marker.
            remainder = raw.split(start)[1]
            if end != "":
                remainder = remainder.split(end)[0]
            return remainder.strip()

        markers = self.params['startswith_']
        # When no closing markers are configured, pair each opener with "".
        closers = self.params.get('endswith_', [""] * len(markers))

        matches = []
        for position, opener in enumerate(markers.keys()):
            if self.text.startswith(opener):
                matches.append((strip_text(self.text, opener, closers[position]),
                                markers[opener]))

        if matches:
            text, kind = matches[0]
        else:
            text, kind = self.text, 'normal'
        return kind, text.strip()
|
src/llm/llm_tools.py
ADDED
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TODO: add a boolean to switch llms
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
+
import json
|
7 |
+
import string
|
8 |
+
import openai
|
9 |
+
|
10 |
+
import wikipedia
|
11 |
+
from langchain.text_splitter import CharacterTextSplitter
|
12 |
+
from langchain.prompts import PromptTemplate
|
13 |
+
from langchain.chains import LLMChain
|
14 |
+
from src.llm.llms import openai_llm
|
15 |
+
from src.tools.wiki import Wiki
|
16 |
+
|
17 |
+
|
18 |
+
# async def get_wikilist_open_source(task: {}) -> str:
|
19 |
+
# """
|
20 |
+
# get the titles of wiki pages interesting for solving the given task
|
21 |
+
# """
|
22 |
+
|
23 |
+
# template = ("<s>[INST] Your task consists in finding the list of wikipedia page titles which provide useful content "
|
24 |
+
# " for a paragraph whose description is delimited by triple backticks.\n"
|
25 |
+
# " Make sure that you provide no more than 10 elements and that the list is actually finished."
|
26 |
+
# " Format your response as a valid JSON list of strings separated by commas.[/INST]</s>"
|
27 |
+
# " Description: ```{description}```")
|
28 |
+
|
29 |
+
# prompt = PromptTemplate(template=template, input_variables=['description'])
|
30 |
+
# llm_chain = LLMChain(llm=opensource_llm, prompt=prompt)
|
31 |
+
# response = llm_chain.run({'description': task['description']})
|
32 |
+
# llm_list = response.choices[0].message.content
|
33 |
+
# try:
|
34 |
+
# wikilist = json.loads(llm_list)
|
35 |
+
# except:
|
36 |
+
# print("json loads failed with" + llm_list)
|
37 |
+
# wikilist = list(llm_list.split(','))
|
38 |
+
|
39 |
+
# expanded_wikilist = []
|
40 |
+
|
41 |
+
# expand_factor = 2
|
42 |
+
|
43 |
+
# for wikipage in wikilist:
|
44 |
+
# expanded_wikilist += wikipedia.search(wikipage, expand_factor)
|
45 |
+
|
46 |
+
# wikilist = list(set(expanded_wikilist))
|
47 |
+
|
48 |
+
# return wikilist
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
async def get_wikilist(task: {}) -> list:
    """Return wikipedia page titles useful for the task's paragraph.

    Asks the LLM for up to 10 candidate titles (as a JSON list), expands
    each candidate through a wikipedia search and deduplicates the result.
    Falls back to a naive comma split when the reply is not valid JSON.
    """

    llm = openai_llm
    template = (f"\n"
                f" Your task consists in finding the list of wikipedia page titles which provide useful content "
                f" for a paragraph whose description is delimited by triple backticks: ```{task['description']}```\n"
                f" "
                f" Make sure that you provide no more than 10 elements and that the list is actually finished."
                f" Format your response as a valid JSON list of strings separated by commas.\n"
                f" \n"
                f" ")

    llm_list = llm.invoke(template)
    try:
        wikilist = json.loads(llm_list)
    except (json.JSONDecodeError, TypeError):
        # BUGFIX: a bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # only a malformed reply should trigger the fallback.
        print("json loads failed with" + llm_list)
        wikilist = list(llm_list.split(','))

    # Expand each candidate into the top search hits to widen coverage.
    expand_factor = 2
    expanded_wikilist = []
    for wikipage in wikilist:
        expanded_wikilist += wikipedia.search(wikipage, expand_factor)

    # Deduplicate (order is not significant downstream).
    wikilist = list(set(expanded_wikilist))

    return wikilist
|
85 |
+
|
86 |
+
|
87 |
+
def extract_list(llm_list: str):
    """Parse the LLM's JSON-ish list reply into a list of wiki titles.

    The reply is split on double quotes after stripping the surrounding
    brackets; fragments that are too short or not mostly ASCII letters
    (e.g. the ', ' separators) are discarded.  Returns [] when the input
    cannot be processed at all.
    """

    def filter_(el: str):
        # Keep fragments longer than 2 chars whose content is >75% letters.
        resp = 2 < len(el)
        usable_length = len([c for c in el if c in string.ascii_letters])
        resp = resp and len(el) * 3 / 4 < usable_length
        return resp

    try:
        wikilist = llm_list[1:-1].split('"')
        wikilist = [el for el in wikilist if filter_(el)]
        print(wikilist)
    except (TypeError, AttributeError):
        # BUGFIX: narrowed from a bare `except:`; only a non-string input
        # can fail the slicing/splitting above.
        wikilist = []
        print('issues with the wikilist')
    return wikilist
|
103 |
+
|
104 |
+
|
105 |
+
# def get_public_paragraph_open_source(task: {}) -> str:
|
106 |
+
# """returns the task directly performed by chat GPT"""
|
107 |
+
|
108 |
+
# template = ("<s>[INST] Your task consists in generating a paragraph whose description is delimited by triple "
|
109 |
+
# "backticks.\n"
|
110 |
+
# " The paragraph belongs at the top level of the hierarchy to a document"
|
111 |
+
# " whose doc_description is delimited by triple backticks.\n"
|
112 |
+
# " Make sure that the paragraph relates the top level of the document\n"
|
113 |
+
# " The paragraph belongs to a higher paragraph in the hierarchy whose description (above) is delimited by "
|
114 |
+
# " triple backticks."
|
115 |
+
# " Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
|
116 |
+
# " The paragraphs comes after previous paragraphs whose description (before) is delimited by triple "
|
117 |
+
# " backticks.\n"
|
118 |
+
# " Make sure that the paragraph relates with previous paragraph without any repetition\n"
|
119 |
+
# " The paragraphs comes before next paragraphs whose description (after) is delimited by triple backticks.\n"
|
120 |
+
# " Make sure that the paragraph prepares the transition to the next paragraph without any "
|
121 |
+
# " repetition. [/INST]</s>"
|
122 |
+
# " Description: ```{description}```"
|
123 |
+
# " Doc description: ```{doc_description}```"
|
124 |
+
# " Above: ```{above}```"
|
125 |
+
# " Before: ```{before}```"
|
126 |
+
# " After: ```{after}```"
|
127 |
+
# )
|
128 |
+
|
129 |
+
# prompt = PromptTemplate(template=template, input_variables=['description', 'doc_description', 'above', 'before', 'after'])
|
130 |
+
# llm_chain = LLMChain(llm=opensource_llm, prompt=prompt)
|
131 |
+
# response = llm_chain.run({'description': task['description'], 'doc_description': task['doc_description'],
|
132 |
+
# 'above': task['above'], 'before': task['before'], 'after': task['after']})
|
133 |
+
# p = response.choices[0].message.content
|
134 |
+
# return p
|
135 |
+
|
136 |
+
def get_public_paragraph(task: {}) -> str:
    """Generate the task's paragraph from the LLM's own (public) knowledge.

    `task` must provide 'description', 'doc_description', 'above', 'before'
    and 'after' — the surrounding context used to keep the generated
    paragraph coherent with the rest of the document.
    """
    # Debug trace of the incoming task.
    print(task)
    llm = openai_llm
    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    p = llm.invoke(template)

    return p
|
167 |
+
|
168 |
+
|
169 |
+
def create_index(wikilist: [str]):
    """Build a semantic index from the given wikipedia page titles.

    Titles whose fetch fails (Wiki().fetch returns an error string) are
    skipped.  The fetched content is wrapped into WikiPage objects, split
    into overlapping ~800-char chunks and indexed.
    """
    fetch = Wiki().fetch

    # BUGFIX: fetch() was called twice per title (once in the filter and
    # once for the kept value), doubling the network traffic; fetch once.
    pages = []
    for title in wikilist:
        page = fetch(title)
        if type(page) != str:
            pages.append((title, page))

    texts = []
    chunk = 800
    for title, page in pages:
        texts.append(WikiPage(title=title, fulltext=page.page_content))

    doc_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=chunk,
        chunk_overlap=100,
        length_function=len,
    )

    # NOTE(review): only the FIRST fetched page is indexed here — confirm
    # whether every entry of `texts` should contribute.
    paragraphs = texts[0].get_paragraphs(chunk=800)

    split_texts = []
    for p in paragraphs:
        split_texts += doc_splitter.split_text(p)

    # Sanity checks on the splitter output.
    for split_text in split_texts:
        assert type(split_text) == str
        assert 0 < len(split_text) < 2 * 500

    # NOTE(review): `Chroma` is not imported in this module — this line
    # raises NameError as written; confirm the intended vector-store import.
    wiki_index = Chroma.from_texts(split_texts)

    return wiki_index
|
201 |
+
|
202 |
+
|
203 |
+
def get_wiki_paragraph(wiki_index, task: {}) -> str:
    """Generate the task's paragraph grounded in wikipedia search results.

    A first LLM pass (get_public_paragraph) produces a query used for the
    semantic search over `wiki_index`; the retrieved passages are then
    injected into the generation prompt as source material.
    """

    task_description = get_public_paragraph(task)
    wiki_paragraphs = semantic_search(wiki_index, task_description)
    # NOTE(review): the separator below is the literal "/n/n", not "\n\n" —
    # confirm whether real newlines were intended.
    text_content = ""
    for p in wiki_paragraphs:
        text_content += p.page_content + "/n/n"

    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    # FIX: use `invoke` like the sibling helpers — calling the LLM object
    # directly (`llm(template)`) relies on the deprecated __call__ path.
    p = llm.invoke(template)

    return p
|
242 |
+
|
243 |
+
|
244 |
+
# def get_private_paragraph_open_source(texts, task: {}) -> str:
|
245 |
+
# """useful to get a summary in one line from wiki index"""
|
246 |
+
|
247 |
+
# text_content = ""
|
248 |
+
# for t in texts:
|
249 |
+
# text_content += t + "/n/n"
|
250 |
+
|
251 |
+
# template = ("\n"
|
252 |
+
# " Your task consists in generating a paragraph"
|
253 |
+
# " whose description is delimited by triple backticks\n"
|
254 |
+
# " The text generation is based in the documents provided in these sections \n"
|
255 |
+
# " delimited by by triple backticks (text_content)\n"
|
256 |
+
# " The paragraph belongs at the top level of the hierarchy to a document"
|
257 |
+
# " whose description is delimited by triple backticks (doc_decription)\n"
|
258 |
+
# " Make sure that the paragraph relates the top level of the document\n"
|
259 |
+
# " \n"
|
260 |
+
# " The paragraph belongs to a higher paragraph in the hierarchy"
|
261 |
+
# " whose description is delimited by triple backticks (above)\n"
|
262 |
+
# " Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
|
263 |
+
# " \n"
|
264 |
+
# " The paragraphs comes after previous paragraphs"
|
265 |
+
# " whose description is delimited by triple backticks (before)\n"
|
266 |
+
# " Make sure that the paragraph relates with previous paragraph without any repetition\n"
|
267 |
+
# " \n"
|
268 |
+
# " The paragraphs comes before next paragraphs"
|
269 |
+
# " whose description is delimited by triple backticks (after)\n"
|
270 |
+
# " Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
|
271 |
+
# " description: ```{description}```"
|
272 |
+
# " text_content: ```{text_content}```"
|
273 |
+
# " doc_description: ```{doc_description}```"
|
274 |
+
# " above: ```{above}```"
|
275 |
+
# " before: ```{before}```"
|
276 |
+
# " after: ```{after}```")
|
277 |
+
|
278 |
+
# prompt = PromptTemplate(template=template, input_variables=['description', 'text_content', 'doc_description', 'above', 'before', 'after'])
|
279 |
+
# llm_chain = LLMChain(llm=opensource_llm, prompt=prompt)
|
280 |
+
# response = llm_chain.run({'description': task['description'], 'text_content': text_content, 'doc_description': task['doc_description'],
|
281 |
+
# 'above': task['above'], 'before': task['before'], 'after': task['after']})
|
282 |
+
# p = response.choices[0].message.content
|
283 |
+
|
284 |
+
|
285 |
+
def get_private_paragraph(texts, task: {}) -> str:
    """Generate the task's paragraph grounded in the given private texts.

    `texts` is an iterable of source strings concatenated into the prompt;
    the surrounding-context fields of `task` ('description',
    'doc_description', 'above', 'before', 'after') keep the generated
    paragraph coherent with the rest of the document.
    """

    # Concatenate the source texts for the prompt.
    # NOTE(review): the separator is the literal "/n/n", not "\n\n" —
    # confirm whether real newlines were intended.
    text_content = ""
    for t in texts:
        text_content += t + "/n/n"

    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm.invoke(template)

    return p
|
322 |
+
|
323 |
+
def summarize_paragraph_v2(prompt : str, title_doc : str = '', title_para : str = ''):
    """Summarize a paragraph in English with gpt-3.5-turbo-16k.

    `prompt` is expected to start with '<location> :' — the part before the
    first ' :' tells the model where the paragraph sits in the document.
    Returns the stripped summary text.
    """
    # Soft length limit communicated to the model (not enforced locally).
    max_tokens = 850
    # Everything before the first ' :' names the section of the document.
    location_of_the_paragraph = prompt.split(" :")[0]
    task = (f"Your task consists in summarizing in English the paragraph of the document untitled ```{title_doc}``` located in the ```{location_of_the_paragraph}``` section of the document."
            f"The paragraph title is ```{title_para}```."
            f"Your response shall be concise and shall respect the following format:"
            f"<summary>"
            f"If you see that the summary that you are creating will not respect ```{max_tokens}``` tokens, find a way to make it shorter.")
    generation = openai.chat.completions.create(model="gpt-3.5-turbo-16k", messages=[{"role":"system","content":task},{"role":"user","content":prompt}])
    res = generation.choices[0].message.content
    # Debug trace of the raw model output.
    print("****************")
    print(res)
    print("----")
    return str(res).strip()
|
src/llm/llms.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LLM singletons shared by the rest of the application.
from langchain_openai import OpenAI
from transformers import AutoModelForCausalLM
import os


# Allow the HF tokenizers library to use parallel workers (silences its
# fork-related warning).
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Fall back to the local (git-ignored) config_key.py when the API key is
# not already provided through the environment.
if not "OPENAI_API_KEY" in os.environ:
    from config_key import OPENAI_API_KEY

    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# Deterministic (temperature=0) completion model used across src/llm.
openai_llm = OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")

# opensource_llm = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") #LAMA MODEL
|
src/model/block.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Block:
    """A chunk of document content plus its position metadata.

    Blocks are flat, serializable records (see to_dict/from_dict) used to
    move document fragments in and out of the retrieval store.
    """

    def __init__(self, doc: str = '', title: str = '', content: str = '', content_fr: str = '',
                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
        self.doc = doc
        self.title = title
        self.title_fr = ""
        self.content = content
        self.content_fr = content_fr
        self.specials = []        # special action strings ('##### ...' lines)
        self.index = index        # dotted section index, e.g. '1.2.3'
        self.rank = rank
        self.level = level
        self.distance = distance  # retrieval distance; 99999 means "unset"

    def to_dict(self) -> dict:
        """Serialize to a flat dict; specials are exploded into numbered keys."""
        # FIX: the return annotation was the dict literal `{}`; use `dict`.
        block_dict = {'doc': self.doc,
                      'title': self.title,
                      'title_fr': self.title_fr,
                      'content': self.content,
                      'content_fr': self.content_fr,
                      'index': self.index,
                      'rank': self.rank,
                      'level': self.level,
                      'distance': self.distance}
        for i, s in enumerate(self.specials):
            special_key = 'special_' + str(i)
            block_dict[special_key] = s
        block_dict['specials_len'] = len(self.specials)
        return block_dict

    def from_dict(self, block_dict: dict):
        """Restore the block from a dict produced by to_dict; returns self."""
        self.doc = block_dict['doc']
        self.title = block_dict['title']
        self.title_fr = block_dict['title_fr']
        self.content = block_dict['content']
        self.content_fr = block_dict['content_fr']
        self.index = block_dict['index']
        self.rank = block_dict['rank']
        self.level = block_dict['level']
        self.distance = block_dict['distance']
        self.specials = [block_dict['special_' + str(i)]
                         for i in range(block_dict['specials_len'])]
        return self

    @property
    def distance_str(self) -> str:
        """The distance formatted with two decimals, for display."""
        return format(self.distance, '.2f')
|
src/model/container.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.model.paragraph import Paragraph
|
2 |
+
from src.model.block import Block
|
3 |
+
|
4 |
+
INFINITE = 99999
|
5 |
+
|
6 |
+
|
7 |
+
class Container:
    """A docx section: its title paragraph, own paragraphs and sub-sections.

    Containers mirror the heading hierarchy of the document.  `index` holds
    the hierarchical section number (e.g. [1, 2]) and `id_` a unique tree id
    derived from the father's id, so nodes can be addressed from the UI.
    """

    # NOTE: annotations referencing Paragraph are stringified so the class
    # can be defined even when the import is resolved lazily.
    def __init__(self, paragraphs: "list[Paragraph]", title: "Paragraph" = None, level: int = 0,
                 index: "list[int]" = None, father=None, id_=0):
        if index is None:
            index = []
        self.level = level
        self.title = title
        self.paragraphs = []
        self.children = []
        self.index = index
        self.father = father  # if not father, then the container is at the top of the hierarchy
        # Unique id: '1' + father's id + local child rank, read as an int.
        self.id_ = int(str(1) + str(father.id_) + str(id_))
        if paragraphs:
            self.paragraphs, self.children = self.create_children(paragraphs, level, index)
            self.blocks = self.get_blocks()
            self.normals, self.comments, self.tasks = self.sort_paragraphs()

    @property
    def text(self):
        """Full text of the section: title line, own paragraphs, then children."""
        text = ""
        if self.title:
            text = "Titre " + str(self.level) + " : " + self.title.text + '\n'
        for p in self.paragraphs:
            text += p.text + '\n'
        for child in self.children:
            text += child.text
        return text

    @property
    def text_chunks(self, chunk=500):
        """Own + descendant text cut into chunks of at most ~`chunk` chars.

        NOTE(review): a property cannot receive arguments, so `chunk` is
        always 500 here; also the paragraph that overflows a chunk is dropped
        rather than carried into the next chunk — confirm both behaviours.
        """
        text_chunks = []
        text_chunk = ""
        for p in self.paragraphs:
            if chunk < len(text_chunk) + len(p.text):
                text_chunks.append(text_chunk)
                text_chunk = ""
            else:
                text_chunk += " " + p.text
        if text_chunk and not text_chunk.isspace():
            text_chunks.append(text_chunk)
        for child in self.children:
            text_chunks += child.text_chunks
        return text_chunks

    def get_blocks(self):
        """Flatten the subtree into Block records (one per non-empty section)."""
        block = Block(level=self.level, index=self.index)
        if self.title:
            block.title = self.title.text
        for p in self.paragraphs:
            if not p.blank:
                if p.text.startswith('##### '):
                    # BUGFIX: str.lstrip('##### ') strips a *character set*
                    # ('#' and ' '), not the prefix, and could eat leading
                    # characters of the action itself; slice the prefix off.
                    special_action = p.text[len('##### '):]
                    block.specials.append(special_action)
                else:
                    block.content += p.text
        blocks = [block] if block.content or block.specials else []
        for child in self.children:
            blocks += child.blocks
        return blocks

    def create_children(self, paragraphs: "list[Paragraph]", level: int, index: "list[int]") -> "tuple[list, list]":
        """
        creates children containers or directly attached content
        and returns the list of containers and contents of level+1
        :return:
            [Content or Container]
        """
        attached_paragraphs = []
        container_paragraphs = []
        container_title = None
        children = []
        in_children = False
        child_id = 0
        # Start "infinitely deep" so the first heading always closes nothing.
        level = INFINITE

        while paragraphs:
            p = paragraphs.pop(0)
            if not in_children and not p.is_structure:
                # Plain text before the first heading attaches to self.
                attached_paragraphs.append(p)
            else:
                in_children = True
                if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
                    if container_paragraphs or container_title:
                        # Advance the hierarchical index for the closed child.
                        if level <= len(index):
                            index = index[:level]
                            index[-1] += 1
                        else:
                            for i in range(level - len(index)):
                                index.append(1)
                        children.append(Container(container_paragraphs, container_title, level, index, self, child_id))
                        child_id += 1
                    container_paragraphs = []
                    container_title = p
                    level = p.level

                else:  # p is normal text or strictly lower in hierarchy, then the child continues to grow
                    container_paragraphs.append(p)

        # Flush the last child being built.
        if container_paragraphs or container_title:
            if level <= len(index):
                index = index[:level]
                index[-1] += 1
            else:
                for i in range(level - len(index)):
                    index.append(1)
            children.append(Container(container_paragraphs, container_title, level, index, self, child_id))
            child_id += 1

        return attached_paragraphs, children

    @property
    def structure(self):
        """Tree description for the UI tree widget (self + paragraphs + children)."""
        # NOTE(review): `self.rank` is never assigned in this class, so this
        # property raises AttributeError as written — confirm where `rank`
        # is supposed to be set.
        self_structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': True,
            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
            'canRename': True,
            'data': {},
            'level': self.level,
            'rank': self.rank,
            'title': self.title.text if self.title else 'root'
        }}
        paragraphs_structure = [p.structure for p in self.paragraphs]
        structure = [self_structure] + paragraphs_structure
        for child in self.children:
            structure += child.structure
        return structure

    def sort_paragraphs(self) -> "tuple[list[Paragraph], list[Paragraph], list[Paragraph]]":
        """Split own paragraphs into (normal, comment, task) lists by type."""
        mapping = {'normal': [], 'comment': [], 'task': []}
        for p in self.paragraphs:
            # BUGFIX: the original did `mapping(p.type)`, *calling* the dict
            # (TypeError on the first paragraph); subscript it instead.
            mapping[p.type].append(p)
        return mapping['normal'], mapping['comment'], mapping['task']
|
src/model/doc.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import docx
|
2 |
+
|
3 |
+
from src.model.container import Container
|
4 |
+
from src.model.paragraph import Paragraph
|
5 |
+
|
6 |
+
|
7 |
+
class Doc:
    """In-memory model of a .docx document: wraps the python-docx document,
    builds the paragraph/container tree, and exposes its blocks and tasks."""

    def __init__(self, path='', id_=None):
        """Load the document at *path* and build its container tree.

        Args:
            path: filesystem path to the .docx file.
            id_: unused; kept for backward compatibility with existing callers.
        """
        self.xdoc = docx.Document(path)
        self.title = path.split('/')[-1]
        # Identity comes from the object itself; the id_ parameter is ignored.
        self.id_ = id(self)
        self.path = path
        paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)]
        self.container = Container(paragraphs, father=self, level=0)
        self.blocks = self.get_blocks()
        self.tasks = [c.get_task(self.container.one_liner) for c in self.container.containers if c.task]

    @property
    def structure(self):
        """Tree-view structure of the whole document (delegates to the root container)."""
        return self.container.structure

    def get_blocks(self):
        """Return the document's blocks, tagging each with the doc title and a
        dotted string index; level-0 (root) blocks are dropped.

        BUG FIX: the original removed level-0 blocks from the list while
        iterating over it, which skips the element following each removal and
        leaves its index unconverted. Build a filtered list instead (the
        original self.container.blocks list is left untouched).
        """
        def from_list_to_str(index_list):
            # [1, 2, 3] -> "1.2.3" (empty list -> "").
            return '.'.join(str(el) for el in index_list)

        kept = []
        for block in self.container.blocks:
            block.doc = self.title
            if block.level == 0:
                # Level 0 is the untitled document root, not a real section.
                continue
            block.index = from_list_to_str(block.index)
            kept.append(block)
        return kept
|
54 |
+
|
src/model/paragraph.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
|
3 |
+
INFINITE = 10000


class Paragraph:
    """Wrapper around a python-docx paragraph with a composite id, a heading
    level derived from the style name, and convenience accessors."""

    def __init__(self, xparagraph, doc_id: int, id_: int):
        self.xparagraph = xparagraph
        # Composite id: literal "2" marker + owning doc id + paragraph index.
        # NOTE(review): plain digit concatenation can collide (doc 1/para 23
        # vs doc 12/para 3) — confirm uniqueness is guaranteed upstream.
        self.id_ = int(str(2) + str(doc_id) + str(id_))
        self.level = self.get_level_from_name()
        # A paragraph is "structure" when its style encodes a heading level.
        self.is_structure = self.level < INFINITE
        self.text = self.xparagraph.text
        # NOTE(review): get_type() is not defined on this class in this file;
        # it must be provided elsewhere or this constructor raises AttributeError.
        self.type = self.get_type()

    @property
    def structure(self):
        """Tree-view node dict for this paragraph (a leaf: no children)."""
        structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': False,
            'children': [],
            'title': self.text,
            'canRename': True,
            'data': {},
            'level': self.level,
        }}
        return structure

    @property
    def blank(self):
        """
        checks if the paragraph is blank: i.e. it brings some signal (it may otherwise be ignored)
        """
        text = self.text.replace('\n', '')
        return set(text).isdisjoint(string.ascii_letters)

    def get_level_from_name(self) -> int:
        """Derive the heading level from the style name's trailing digit.

        Styles containing '.Titre' followed by a digit are headings (level =
        that digit); everything else is INFINITE (not structure).
        """
        style_name = self.xparagraph.style.name
        level = INFINITE
        if '.Titre' in style_name:
            suffix = style_name[-1]
            try:
                level = int(suffix)
            except ValueError:
                # BUG FIX: was a bare `except:`; only a non-numeric suffix
                # (e.g. "X.TitreFoo") should be treated as "not a heading".
                pass
        return level
|
49 |
+
|
50 |
+
|
src/reader/reader_for_requirements.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import docx
|
2 |
+
import os
|
3 |
+
from docx.document import Document as _Document
|
4 |
+
from src.domain.requirements_paragraphs import Requirement_Paragraph
|
5 |
+
from docx.oxml.text.paragraph import CT_P
|
6 |
+
from docx.oxml.table import CT_Tbl
|
7 |
+
from docx.table import _Cell, Table
|
8 |
+
from docx.text.paragraph import Paragraph
|
9 |
+
|
10 |
+
class WordReader:
    """Reads a .docx file and flattens its paragraphs and tables into
    Requirement_Paragraph objects, estimating a page number for each from a
    running character count."""

    def __init__(self, path):
        self.path = path
        self.paragraphs = self.get_paragraphs()

    def iter_block_items(self, parent):
        """Yield the Paragraph and Table children of *parent* in document order.

        *parent* must be a python-docx Document or a table _Cell; anything
        else raises ValueError.
        """
        if isinstance(parent, _Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("Unsupported parent type")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    def get_paragraphs(self):
        """Parse the document into a list of Requirement_Paragraph objects.

        Tables are collapsed into a single pseudo-paragraph whose style is the
        predominant style of their cells.

        Raises:
            FileNotFoundError: when the path does not exist.
            ValueError: when parsing fails (original error chained).
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")
        try:
            doc = docx.Document(self.path)
            paragraph_objects = []
            paragraph_id = 0
            page_id = 1  # refreshed from the running character count below
            total_characters = 0
            for block in self.iter_block_items(doc):
                if isinstance(block, Paragraph):
                    paragraph_info = self.extract_paragraph_info(block)
                    if paragraph_info:  # Only append if paragraph is not empty
                        page_id = self.estimate_page_number(total_characters)
                        p_obj = Requirement_Paragraph(text=paragraph_info['text'], font_style=paragraph_info['style'], id_=paragraph_id, page_id=page_id)
                        paragraph_objects.append(p_obj)
                        paragraph_id += 1
                        total_characters += len(paragraph_info['text'])
                elif isinstance(block, Table):
                    table_paragraph, table_style = self.table_to_paragraph(block)
                    if table_paragraph.strip():  # Check if table paragraph is not empty
                        p_obj = Requirement_Paragraph(text=table_paragraph, font_style=table_style, id_=paragraph_id, page_id=page_id)
                        paragraph_objects.append(p_obj)
                        paragraph_id += 1
            return paragraph_objects
        except Exception as e:
            # BUG FIX: chain the original exception so the real traceback is
            # preserved for debugging.
            raise ValueError(f"Error reading the .docx file. Original error: {str(e)}") from e

    def determine_predominant_style(self, styles):
        """Return the most frequent style name in *styles* ("None" when empty)."""
        # Count the occurrences of each style
        style_counts = {}
        for style in styles:
            if style in style_counts:
                style_counts[style] += 1
            else:
                style_counts[style] = 1

        # Find the style with the highest count
        predominant_style = max(style_counts, key=style_counts.get, default="None")
        return predominant_style

    def estimate_page_number(self, total_characters):
        """Rough 1-based page estimate assuming ~2000 characters per page."""
        avg_chars_per_page = 2000
        return total_characters // avg_chars_per_page + 1

    def extract_paragraph_info(self, paragraph):
        """Return {'text', 'style', 'runs'} for *paragraph*, or None if empty."""
        # Check if paragraph is empty
        if not paragraph.text.strip():
            return None  # Return None for empty paragraphs

        paragraph_style = paragraph.style.name if paragraph.style else 'None'

        runs = []
        for run in paragraph.runs:
            run_details = {
                'text': run.text,
                'font_name': run.font.name,
                'font_size': run.font.size.pt if run.font.size else None,
                'bold': run.bold,
                'italic': run.italic,
                'underline': run.underline
            }
            runs.append(run_details)

        return {
            'text': paragraph.text,
            'style': paragraph_style,
            'runs': runs
        }

    def table_to_paragraph(self, table):
        """Flatten *table* into (text, predominant_style); cells are joined
        with ' | ' and rows with newlines."""
        table_text = ""
        table_styles = set()

        for row in table.rows:
            for cell in row.cells:
                cell_text = ""
                for paragraph in cell.paragraphs:
                    paragraph_style = paragraph.style.name if paragraph.style else 'None'
                    table_styles.add(paragraph_style)

                    for run in paragraph.runs:
                        cell_text += run.text

                    cell_text += " "
                table_text += cell_text.strip() + " | "  # Add a separator for cells
            table_text = table_text.strip() + "\n"  # Add a newline for rows

        predominant_style = self.determine_predominant_style(table_styles)

        return table_text.strip(), predominant_style

    def print_paragraphs_and_tables(self):
        """Debug helper: print the text of every extracted paragraph.

        BUG FIX: the original indexed the Requirement_Paragraph objects as if
        they were dicts ('paragraph' in item), which never matched, so nothing
        was ever printed; print the paragraph text directly instead.
        """
        try:
            print("start")
            for item in self.get_paragraphs():
                print("Paragraph:", item.text)
                print('-' * 40)  # separator for clarity

        except Exception as e:
            print(f"Error: {str(e)}")
|
src/retriever/retriever.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.domain.block import Block
|
2 |
+
from src.domain.doc import Doc
|
3 |
+
from src.llm.llm_tools import summarize_paragraph_v2
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
class Retriever:
    """
    The Retriever class is responsible for processing and summarizing documents.
    It supports operations such as summarizing individual blocks of text, organizing
    text into a hierarchy, and conducting similarity searches within a collection of documents.

    Attributes:
        collection: A collection object where summaries and metadata are stored.
        llmagent: An instance of LlmAgent used for generating summaries.
    """


    def __init__(self, doc: Doc = None, collection=None):
        """
        Initializes the Retriever class with a document, a collection, and a language model agent.

        When *doc* is given, every block is summarized via the LLM and stored
        into *collection* (long blocks are split at 4500 characters first);
        otherwise the instance just wraps an existing collection for querying.

        Args:
            doc: A document object containing text blocks to be processed.
            collection: A collection object to store summaries and metadata.
            llmagent: An instance of LlmAgent for generating summaries.
        """

        if doc is not None:
            self.collection = collection
            blocks_good_format = doc.blocks_requirements  # List of Block objects from the document.
            gr.Info("Please wait while the database is being created")

            # Process each block in the document.
            for block in blocks_good_format:
                print(f"block index : {block.index}")
                # If block content is longer than 4500 characters, split and summarize separately.
                if len(block.content) > 4500:
                    new_blocks = block.separate_1_block_in_n(max_size=4500)
                    for new_block in new_blocks:
                        summary = summarize_paragraph_v2(prompt=new_block.content, title_doc=doc.title, title_para=block.title)
                        # Keep only the text after the <summary> tag when the LLM echoes it.
                        if "<summary>" in summary:
                            summary = summary.split("<summary>")[1]

                        self.collection.add(
                            documents=[summary],
                            ids=[new_block.index],
                            metadatas=[new_block.to_dict()]
                        )
                else:
                    # Summarize the block as is if it's shorter than 4500 characters.
                    summary = summarize_paragraph_v2(prompt=block.content, title_doc=doc.title, title_para=block.title)
                    if "<summary>" in summary:
                        summary = summary.split("<summary>")[1]
                    self.collection.add(
                        documents=[summary],
                        ids=[block.index],
                        metadatas=[block.to_dict()]
                    )

            # Summarize blocks by their hierarchy level after individual processing.
            self.summarize_by_hierarchy(blocks_good_format, doc.title)
            gr.Info(f"The collection {collection.name} has been added to the database")
        else:
            self.collection = collection




    def summarize_by_hierarchy(self, blocks, doc_title):
        """
        Summarizes blocks based on their hierarchical levels.

        Args:
            blocks: A list of Block objects to be summarized.
            llmagent: An instance of LlmAgent used for generating summaries.
            doc_title: The title of the document being processed.
        """
        hierarchy = self.create_hierarchy(blocks)
        deepest_blocks_indices = self.find_deepest_blocks(blocks)
        print("Hierarchy levels identified:", hierarchy.keys())
        # NOTE(review): deepest_blocks_indices holds index STRINGS, so
        # block.index here is str.index (a bound method) — this debug line
        # prints method reprs, not indices.
        print("Deepest block indices:", [block.index for block in deepest_blocks_indices])

        for level, level_blocks in hierarchy.items():
            # Summarize only if the level has more than one block and contains deepest blocks.
            print(level)
            print(level_blocks)
            print(deepest_blocks_indices)
            print(len(level_blocks))
            if len(level_blocks) > 1 and any(block.index in deepest_blocks_indices for block in level_blocks):
                level_content = " ".join(block.content for block in level_blocks)

                print(f"Summarizing level {level} with content from blocks: {[block.index for block in level_blocks]}")
                level_summary = summarize_paragraph_v2(prompt=level_content, title_doc=doc_title, title_para=f"Summary of section : {level}")

                level_summary_id = f"summary_{level}"
                # Initialize a new Block object with properties from the first block

                first_block = level_blocks[0]
                combined_block = Block(
                    doc=first_block.doc,
                    title=first_block.title,
                    # NOTE(review): recomputes the same join as level_content above.
                    content=" ".join(block.content for block in level_blocks),
                    index=first_block.index,
                    rank=first_block.rank,
                    level=first_block.level,
                    distance=first_block.distance
                )


                self.collection.add(
                    documents=[level_summary],
                    ids=[level_summary_id],
                    metadatas=[combined_block.to_dict()]  # Pass the combined block metadata
                )
                # List of dictionaries, each representing a block

                print(f"Added summary for level {level} to the collection.")
            else:
                # Skip summarization for levels that are deepest blocks.
                print(f"Skipping level {level} as it is deepest blocks.")


    def create_hierarchy(self, blocks):
        """
        Creates a hierarchical structure of the blocks based on their indices.

        Args:
            blocks: A list of Block objects to be organized into a hierarchy.

        Returns:
            A dictionary representing the hierarchy of blocks.
        """
        hierarchy = {}
        for block in blocks:
            # Every prefix level of the block's index gets a reference to it.
            levels = self.extract_levels(block.index)
            for level in levels:
                hierarchy.setdefault(level, []).append(block)
        return hierarchy


    def extract_levels(self, index):
        """
        Extracts all hierarchical levels from a block index.

        Args:
            index: The index string of a block.

        Returns:
            A list of levels extracted from the index.
        """
        # Splits the index string and creates a list of hierarchical levels.
        # e.g. "1.2.3" -> ["1", "1.2", "1.2.3"]
        parts = index.split('.')
        levels = ['.'.join(parts[:i]) for i in range(1, len(parts) + 1)]
        return levels


    def find_deepest_blocks(self, blocks):
        """
        Identifies the deepest blocks in the hierarchy.

        Args:
            blocks: A list of Block objects.

        Returns:
            A set of indices representing the deepest blocks.
        """
        deepest_blocks = set()
        block_indices = {block.index for block in blocks}
        for block in blocks:
            # A block is considered deepest if no other block's index extends it.
            if not any(b_index != block.index and b_index.startswith(block.index + '.') for b_index in block_indices):
                deepest_blocks.add(block.index)
        return deepest_blocks



    def similarity_search(self, queries: str) -> {}:
        """
        Performs a similarity search in the collection based on given queries.

        Args:
            queries: A string or list of strings representing the query or queries.

        Returns:
            A list of Block objects that are similar to the given queries.
        """
        # Query the collection and retrieve blocks based on similarity.
        res = self.collection.query(query_texts=queries, n_results=5)
        block_dict_sources = res['metadatas'][0]
        distances = res['distances'][0]
        blocks = []
        for bd, d in zip(block_dict_sources, distances):
            # Rehydrate a Block from its stored metadata and attach the distance.
            b = Block().from_dict(bd)
            b.distance = d
            blocks.append(b)

        return blocks
|
198 |
+
|
src/tools/doc_tools.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from PIL import Image
|
2 |
+
import os
|
3 |
+
import docx
|
4 |
+
import docx.oxml.ns as ns
|
5 |
+
|
6 |
+
def get_positions(xml_file):
    """Extract the first numeric cx (width) and cy (height) attribute values
    found in raw drawing XML.

    Returns the values as strings; when no numeric value follows a given
    attribute, the corresponding split-list is returned unchanged (this
    mirrors the historical fallthrough behaviour).
    """
    def first_numeric_value(chunks):
        # Each chunk after a split on `attr="` starts with the attribute
        # value, terminated by the closing quote.
        for chunk in chunks:
            candidate = chunk.split('"')[0]
            if candidate.isnumeric():
                return candidate
        return chunks

    width = first_numeric_value(xml_file.split('cx="'))
    height = first_numeric_value(xml_file.split('cy="'))
    return width, height
|
26 |
+
|
27 |
+
def convert_to_png(imageslist):
    # Convert every non-PNG image in *imageslist* to PNG on disk, replace its
    # entry in the list with the new path, and delete the original file.
    # Returns the (mutated) list.
    for image in imageslist:
        if(image.endswith('.png')):
            continue
        im = Image.open(image)
        # Save under the same basename with a .png extension.
        # NOTE(review): image.split('.')[0] truncates at the FIRST dot, so a
        # path like "dir.v2/pic.jpg" maps to a wrong name — confirm inputs
        # only ever contain one dot.
        im.save(image.split('.')[0]+'.png')
        # Replace the entry in place; the list length is unchanged, so the
        # ongoing iteration stays valid.
        imageslist[imageslist.index(image)] = image.split('.')[0]+'.png'
        os.remove(image)
    return imageslist
|
36 |
+
|
37 |
+
|
38 |
+
def get_difference_with_template(styles_used_in_doc, template):
    """Return the styles from *styles_used_in_doc* whose names appear neither
    in *template*'s style names nor earlier in the result (first occurrence
    of each name wins).

    Args:
        styles_used_in_doc: iterable of style objects (must expose .name).
        template: document whose template styles serve as the reference set.
    """
    styles_used_in_template = template.styles.names
    different_styles = []
    # Track collected names in a set instead of rescanning different_styles
    # for every candidate (the original was O(n^2)).
    seen_names = set()
    for style in styles_used_in_doc:
        if style.name not in styles_used_in_template and style.name not in seen_names:
            different_styles.append(style)
            seen_names.add(style.name)
    return different_styles
|
46 |
+
|
47 |
+
|
48 |
+
def update_table_of_contents(doc):
    # Mark the document so Word refreshes all fields (including the table of
    # contents) the next time it is opened: python-docx cannot recompute the
    # TOC itself, it can only set the w:updateFields flag in settings.xml.
    # Find the settings element in the document
    settings_element = doc.settings.element

    # Create an "updateFields" element and set its "val" attribute to "true"
    update_fields_element = docx.oxml.shared.OxmlElement('w:updateFields')
    update_fields_element.set(ns.qn('w:val'), 'true')

    # Add the "updateFields" element to the settings element
    settings_element.append(update_fields_element)
|
58 |
+
|
59 |
+
|
60 |
+
def left_part_until_number(s):
    """Return the prefix of *s* up to (but excluding) its first digit,
    or None when *s* contains no digit at all."""
    first_digit_pos = next((i for i, ch in enumerate(s) if ch.isdigit()), None)
    return None if first_digit_pos is None else s[:first_digit_pos]
|
65 |
+
|
66 |
+
def get_title(path) -> str:
    """Return the file-name component of *path*.

    Handles both '/' and '\\' separators; when both appear, the backslash
    split of the full path wins (matches the historical behaviour).
    """
    res = path
    if '/' in path:
        res = path.rsplit('/', 1)[-1]
    if '\\' in path:
        res = path.rsplit('\\', 1)[-1]
    return res
|
src/tools/index_creation.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.domain.container import Container
|
2 |
+
|
3 |
+
INFINITE = 99999

def create_dic_levels(c: "Container", dict_of_levels: dict = None):
    """Recursively collect, for every container level in the tree rooted at
    *c*, the starting index pattern for that level.

    Level 0 (the root) maps to [0]; a newly seen level N maps to N ones.
    The INFINITE pseudo-level (title-less containers) is normalised to [1].

    BUG FIX: the original used a mutable default argument ({}), so the dict
    was shared across independent top-level calls; default to None and create
    a fresh dict per call instead.
    """
    if dict_of_levels is None:
        dict_of_levels = {}
    if c.level == 0:
        dict_of_levels[c.level] = [0]
    for child in c.children:
        if child.level not in dict_of_levels:
            dict_of_levels[child.level] = [1 for _ in range(child.level)]
        create_dic_levels(child, dict_of_levels)
    if INFINITE in dict_of_levels.keys():
        dict_of_levels[INFINITE] = [1]
    return dict_of_levels
|
15 |
+
|
16 |
+
|
17 |
+
def create_good_indexes(c: "Container", dict_of_levels: dict):
    """Recursively assign a hierarchical index to *c* and all descendants.

    Mutates both c.index (set to a copy of the current pattern for its level)
    and *dict_of_levels* (the running counters per level).
    """
    actual_level = c.level
    c.index = dict_of_levels[actual_level].copy()
    actual_len = len(dict_of_levels[actual_level])
    temp_update = dict_of_levels[actual_level][-1]
    dict_of_levels[actual_level][-1] += 1
    # Keep the deeper level patterns in sync with this level's position.
    for i in dict_of_levels.values():
        if len(i) > actual_len:
            i[actual_len - 1] = temp_update
    for child in c.children:
        c_lvl = child.level
        # Entering a child resets the counters of all deeper levels to 1.
        for i in dict_of_levels.values():
            if len(i) > c_lvl:
                i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
        create_good_indexes(child, dict_of_levels)  # Apply the function recursively to all children


def create_good_indexes_not_ordered_titles(c: "Container", dict_of_levels: dict):
    """Entry point kept for callers handling out-of-order title levels.

    CONSOLIDATION: the original body was a line-for-line duplicate of
    create_good_indexes (it even recursed into create_good_indexes), so it
    now simply delegates; behaviour is unchanged.
    """
    create_good_indexes(c, dict_of_levels)
|
49 |
+
|
50 |
+
|
51 |
+
def set_good_block_indexes(c: "Container"):
    """Copy each container's index onto its blocks, first normalising any
    leftover 0 entries to 1.

    The container's index list is mutated in place and shared (by reference)
    with every block of that container.
    """
    for container in c.containers:
        for block in container.blocks:
            index = container.index
            for position, value in enumerate(index):
                if value == 0:
                    index[position] = 1
            block.index = index
|
59 |
+
|
60 |
+
def set_indexes(c: "Container", path: str):
    """Compute and assign hierarchical indexes for the container tree *c*,
    then propagate them onto the blocks.

    Generated and template files are skipped entirely.
    """
    if "temp/generated_files" in path or "data/templates" in path:
        return
    level_patterns = create_dic_levels(c)
    # Process levels in ascending order.
    level_patterns = {lvl: level_patterns[lvl] for lvl in sorted(level_patterns)}
    structure_levels = list(level_patterns.keys())[1:]
    if c.children and c.children[0] and (c.children[0].level > min(structure_levels)):
        # The first title sits deeper than the shallowest level present:
        # promote it so numbering still starts at the top level.
        c.children[0].level = min(structure_levels)
        create_good_indexes_not_ordered_titles(c, level_patterns)
    else:
        create_good_indexes(c, level_patterns)
    set_good_block_indexes(c)
|
src/tools/list_tool.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def keep_last_occurrences(lst, key):
    """Deduplicate *lst* in place by key(item), keeping only the LAST
    occurrence of each key (relative order otherwise preserved).

    Returns the same (mutated) list object.
    """
    seen_keys = set()
    # Walk backwards so deletions never shift the indices still to visit.
    for position in range(len(lst) - 1, -1, -1):
        item_key = key(lst[position])
        if item_key in seen_keys:
            del lst[position]
        else:
            seen_keys.add(item_key)
    return lst
|
11 |
+
|
12 |
+
def delete_duplicate_styles(list_styles_to_update, different_styles):
    """Remove from *different_styles* every style already scheduled for update
    in *list_styles_to_update* (matched by document name and style name).

    Mutates and returns *different_styles*.

    BUG FIX: the original removed items from different_styles while iterating
    over it, which skips the element that follows each removal; iterate over a
    snapshot instead.
    """
    for s in list_styles_to_update:
        for d in list(different_styles):
            # "list_style" is formatted "<style name> : <detail>".
            if s["doc"].name == d["doc"].name and s["list_style"].split(" : ")[0] == d["style"].name:
                different_styles.remove(d)
    return different_styles
|
src/tools/paragraph_tools.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from xml.etree import ElementTree as ET
|
2 |
+
|
3 |
+
def find_list_indentation_level(para, doc):
    """Determine whether *para* belongs to a (numbered/bulleted) list and, if
    so, its 1-based indentation level.

    Args:
        para: python-docx paragraph (its raw XML is inspected via para._p.xml).
        doc: wrapper exposing the python-docx document as doc.xdoc.

    Returns:
        (is_list, level): (False, 0) when the paragraph is not part of a list.
    """
    namespace = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    xml_para = para._p.xml
    try:
        xml_numbering = doc.xdoc._part.numbering_part.element.xml
    except Exception:
        # No numbering part in the document: nothing can be a list.
        # (Narrowed from the original bare `except:`.)
        return False, 0
    root_para = ET.fromstring(xml_para)
    root_numbering = ET.fromstring(xml_numbering)
    # <w:num> elements map concrete numIds to abstract numbering definitions.
    abstract_num_reference = [item for item in root_numbering if item.tag == w_ns + "num"]
    is_numPr = root_para.find(".//w:numPr", namespaces=namespace)
    is_style = root_para.find(".//w:pStyle", namespaces=namespace)
    is_numId = root_para.find(".//w:numId", namespaces=namespace)
    is_lvl = root_para.find(".//w:ilvl", namespaces=namespace)
    if is_numPr is not None:
        if is_numId is not None and is_lvl is not None:
            # Direct numbering: the paragraph carries its own ilvl.
            return True, int(is_lvl.attrib[w_ns + "val"]) + 1
        elif is_numId is not None and is_lvl is None:
            # Indirect numbering: resolve numId -> abstractNum, then read the
            # level from the abstract definition.
            numId = int(is_numId.attrib[w_ns + "val"])
            is_abstractNumId = [item for item in abstract_num_reference if item.attrib[w_ns + "numId"] == str(numId)][0]
            numID_reference = is_abstractNumId.find(".//w:abstractNumId", namespaces=namespace).attrib[w_ns + "val"]
            real_numID = root_numbering.find(f".//w:abstractNum[@w:abstractNumId='{int(numID_reference)}']", namespaces=namespace)
            # BUG FIX: the original read style_Id BEFORE assigning it
            # (UnboundLocalError on this path); check the pStyle element first.
            if is_style is None:
                return False, 0
            lvl = real_numID.find(".//w:ilvl", namespaces=namespace)
            # NOTE(review): this reads the w:ilvl ATTRIBUTE of the <w:ilvl>
            # element; in standard OOXML the level normally lives in the
            # element's w:val attribute — confirm on real documents.
            return True, int(lvl.attrib[w_ns + "ilvl"]) + 1
        else:
            # numPr present but no numId: treat as not a list.
            # (The original fell through and implicitly returned None,
            # which would break callers unpacking two values.)
            return False, 0
    else:
        if is_style is None:
            return False, 0
        else:
            # Style-based list: look the style up in the numbering definitions.
            style_Id = is_style.attrib[w_ns + "val"]
            is_style_in_numbering = root_numbering.find(f".//w:pStyle[@w:val='{style_Id}']...", namespaces=namespace)
            if is_style_in_numbering is None:
                return False, 0
            else:
                ilvl = is_style_in_numbering.attrib[w_ns + "ilvl"]
                return True, int(ilvl) + 1
|
src/tools/pretty_print.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.domain.block import Block
|
2 |
+
from src.domain.requirements_paragraphs import Requirement_Paragraph
|
3 |
+
|
4 |
+
def pretty_print_block_and_indexes(b : [Block]):
    # Debug helper: dump each block's dotted index and title to stdout.
    for block in b:
        print(f"{block.index} {block.title if block.title else '___NO TITLE__'}")
        # NOTE(review): the separator's indentation is ambiguous in the
        # source rendering; assumed to print after every block — confirm.
        print(f"----------------------------------")
|
8 |
+
|
9 |
+
def pretty_print_paragraphs(para : [Requirement_Paragraph]):
    # Debug helper: dump each paragraph's level, font style, and text.
    for p in para:
        print(f"{p.level} --> {p.font_style} : {p.text}")
        # NOTE(review): the separator's indentation is ambiguous in the
        # source rendering; assumed to print after every paragraph — confirm.
        print("-------------------")
|
src/tools/semantic_db.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
|
2 |
+
from datetime import datetime
|
3 |
+
|
4 |
+
chroma_client = chromadb.Client()
|
5 |
+
|
6 |
+
|
7 |
+
def get_or_create_collection(coll_name: str):
    # The first 6 characters of the collection name encode its creation
    # date stamp (MMDDHH) — delete_old_collections relies on this metadata.
    date = coll_name[:6]
    coll = chroma_client.get_or_create_collection(name=coll_name, metadata={"date": date})
    return coll
|
11 |
+
|
12 |
+
|
13 |
+
def get_collection(coll_name: str):
    # Thin wrapper: fetch an existing collection (raises in chroma if absent).
    coll = chroma_client.get_collection(name=coll_name)
    return coll
|
16 |
+
|
17 |
+
|
18 |
+
def reset_collection(coll_name: str):
    # Empty the named collection and return it.
    # NOTE(review): Collection.delete() with no ids removes documents from
    # the collection, not the collection itself — confirm this matches the
    # intended "reset" semantics.
    coll = chroma_client.get_collection(name=coll_name)
    coll.delete()
    return coll
|
22 |
+
|
23 |
+
|
24 |
+
def delete_old_collections(old=2):
    # Drop every collection whose MMDDHH date stamp is more than *old* hours old.
    collections = chroma_client.list_collections()
    current_hour = int(datetime.now().strftime("%m%d%H"))

    for coll in collections:
        coll_hour = int(coll.metadata['date'])
        # NOTE(review): comparing MMDDHH values as plain integers breaks
        # across day/month boundaries (e.g. 013123 vs 020100) — confirm this
        # imprecision is acceptable for cleanup purposes.
        if coll_hour < current_hour - old:
            chroma_client.delete_collection(coll.name)
|
32 |
+
|
33 |
+
|
34 |
+
def add_texts_to_collection(coll_name: str, texts: [str], file: str, source: str):
    """
    add texts to a collection : texts originate all from the same file
    """
    coll = chroma_client.get_collection(name=coll_name)
    # One metadata dict per text: marks the origin file and its source.
    filenames = [{file: 1, 'source': source} for _ in texts]
    # Deterministic ids ("<file>-<i>") so re-adding a file replaces its entries.
    ids = [file+'-'+str(i) for i in range(len(texts))]
    try:
        # Delete any previous entries for this file before re-adding.
        coll.delete(ids=ids)
        coll.add(documents=texts, metadatas=filenames, ids=ids)
    except Exception as e:
        # BUG FIX: was a bare `except:` (also swallowing SystemExit and
        # KeyboardInterrupt) and the underlying error was dropped entirely.
        print(f"exception raised for collection :{coll_name}, texts: {texts} from file {file} and source {source}: {e}")
|
46 |
+
|
47 |
+
|
48 |
+
def delete_collection(coll_name: str):
    # Permanently remove the named collection from the chroma client.
    chroma_client.delete_collection(name=coll_name)
|
50 |
+
|
51 |
+
|
52 |
+
def list_collections():
    # Return all collections known to the module-level chroma client.
    return chroma_client.list_collections()
|
54 |
+
|
55 |
+
|
56 |
+
def query_collection(coll_name: str, query: str, from_files: [str], n_results: int = 4):
    """Query *coll_name* for *query*, restricted to documents originating from
    *from_files* (must be non-empty).

    Returns the chroma query result, or "" when the query fails.
    """
    assert 0 < len(from_files)
    coll = chroma_client.get_collection(name=coll_name)
    # Chroma `where` filter: a single clause, or an $or across several files.
    where_ = [{file: 1} for file in from_files]
    where_ = where_[0] if len(where_) == 1 else {'$or': where_}
    # Never request more results than the collection holds.
    n_results_ = min(n_results, coll.count())

    ans = ""
    try:
        ans = coll.query(query_texts=query, n_results=n_results_, where=where_)
    except Exception as e:
        # BUG FIX: was a bare `except:`; narrowed to Exception and the
        # underlying error is now included in the log line.
        print(f"exception raised at query collection for collection {coll_name} and query {query} from files "
              f"{from_files}: {e}")

    return ans
|
src/tools/wiki.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
from langchain.docstore.base import Docstore
|
4 |
+
from langchain.docstore.document import Document
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
class Wiki(Docstore):
    """
    Wrapper around the wikipedia API.
    """

    def __init__(self) -> None:
        """Check that the wikipedia package is installed."""
        try:
            import wikipedia  # noqa: F401
        except ImportError:
            raise ValueError(
                "Could not import wikipedia python package. "
                "Please install it with `pip install wikipedia`."
            )

    @staticmethod
    def fetch(searched_page: str) -> Union[str, Document]:
        """
        Try to fetch the wiki page.

        If the page exists, return a Document with its content and url.
        If the page does not exist (or is ambiguous), return a string
        listing similar entries.
        """
        import wikipedia

        try:
            # wikipedia.set_lang("fr")
            # Fetch the page once instead of twice (was: two network calls,
            # one for .content and one for .url).
            page = wikipedia.page(searched_page)
            result: Union[str, Document] = Document(
                page_content=page.content, metadata={"page": page.url}
            )
        except (wikipedia.PageError, wikipedia.DisambiguationError):
            # Both failure modes produced the same message; merged clauses.
            result = f"Could not find [{searched_page}]. Similar: {wikipedia.search(searched_page)}"
        return result

    @staticmethod
    def search(searched_context: str) -> [str]:
        """
        Find wiki page titles related to the given context.

        Fixed: the original had neither `self` nor @staticmethod, so calling
        it on a Wiki instance raised TypeError. @staticmethod keeps the
        existing `Wiki.search(...)` call form working and fixes instance calls.
        """
        import wikipedia

        try:
            # wikipedia.set_lang("fr")
            page_title_list = wikipedia.search(searched_context)
            result = page_title_list
        except wikipedia.PageError:
            result = f"Could not find [{searched_context}]."
        return result
|
60 |
+
|
61 |
+
|
src/view/log_msg.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
|
3 |
+
from config import config
|
4 |
+
|
5 |
+
|
6 |
+
def create_msg_from(logs: [Dict], docs) -> str:
    """
    Build a human-readable change report from per-document logs.

    :param logs: list of dicts, each possibly keyed by a document name and
        holding that document's modification log
    :param docs: documents (objects with a .name attribute) to report on
    :return: the concatenated report, one section per (doc, log) match
    """
    log_messages = []
    # Message templates/prefixes come from the app configuration.
    log_msg = config['log_msg']
    docs_seen = []
    msg = ''
    for doc in docs:
        for log in logs:
            if doc.name in log.keys():
                # Rebind the loop variable to this document's sub-log.
                log = log[doc.name]
                if 'options_applied' in log.keys():
                    msg += log_msg['options_applied']
                    for option in log['options_applied']:
                        msg += " - " + option + "\n"
                if 'suppressed_styles' in log.keys():
                    if log['suppressed_styles']:
                        msg += log_msg['suppressed_styles']
                        for style_name in log['suppressed_styles']:
                            msg += " - " + style_name + "\n"
                    # NOTE(review): modified/added styles are only reported when
                    # 'suppressed_styles' is present — presumably these keys
                    # always travel together; confirm against the log producer.
                    if log['modified_styles']:
                        msg += log_msg['modified_styles']
                        for style, log_s in log['modified_styles']:
                            msg += log_msg['modified_style'] + style + "\n"
                            for modif, _ in log_s:
                                msg += log_msg[modif] + ' '
                            msg += '\n'
                    if log['added_styles']:
                        msg += log_msg['added_styles']
                        for style_name in log['added_styles']:
                            msg += " - " + style_name + "\n"
                if 'style_mapping' in log.keys():
                    # NOTE(review): assignment (not +=) discards everything
                    # accumulated above for this log — verify this is intended.
                    msg = log['style_mapping']
                if 'list_mapping' in log.keys():
                    # NOTE(review): same overwrite behavior as style_mapping.
                    msg = log['list_mapping']
                if msg:
                    # Prefix the first section of each document with its name.
                    if doc not in docs_seen:
                        msg = log_msg['document'] + doc.name + '\n' + msg
                        docs_seen.append(doc)
                    log_messages.append(msg)
                    msg = ''
    log_messages_str = '\n'.join(log_messages)
    return log_messages_str
|
47 |
+
|
src/view/style_components.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
|
4 |
+
|
5 |
+
def input_files_fn1(input_files_):
    """Reveal the output-files component once input files are present.

    Returns a gradio update dict, or an empty dict when no files were given.
    NOTE(review): relies on a module-level ``output_files_comp`` — confirm it
    is defined by the importing module.
    """
    if not input_files_:
        return {}
    return {output_files_comp: gr.update(visible=True)}
|
src/view/test_view.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
with gr.Blocks() as test:
|
4 |
+
list_2 = ["choix21", "choix 22", "et choix 23"]
|
5 |
+
with gr.Row():
|
6 |
+
with gr.Accordion("See Details") as grac:
|
7 |
+
gr.Markdown("lorem ipsum")
|
8 |
+
hide_btn = gr.Button("hide")
|
9 |
+
show_btn = gr.Button("show")
|
10 |
+
|
11 |
+
def hide_fn():
|
12 |
+
update_ = {
|
13 |
+
grac: gr.update(open=False)
|
14 |
+
}
|
15 |
+
return update_
|
16 |
+
|
17 |
+
def show_fn():
|
18 |
+
update_ = {
|
19 |
+
grac: gr.update(open=True)
|
20 |
+
}
|
21 |
+
return update_
|
22 |
+
|
23 |
+
hide_btn.click(hide_fn,
|
24 |
+
inputs=[],
|
25 |
+
outputs=[grac])
|
26 |
+
show_btn.click(show_fn,
|
27 |
+
inputs=[],
|
28 |
+
outputs=[grac])
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
if __name__ == "__main__":
|
34 |
+
test.launch()
|
src/view/view.py
ADDED
@@ -0,0 +1,533 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from typing import Dict
|
3 |
+
import asyncio
|
4 |
+
import os
|
5 |
+
from src.control.controller import Controller
|
6 |
+
from Levenshtein import distance
|
7 |
+
from src.tools.list_tool import delete_duplicate_styles
|
8 |
+
|
9 |
+
|
10 |
+
def run(config: Dict, controller: Controller):
    """
    Build and return the GenProp gradio UI (a gr.Blocks).

    :param config: app configuration (templates, options, max_styles,
        templates_path, default_template_index, ...)
    :param controller: application controller performing all document work
    :return: the assembled gr.Blocks instance
    """

    """
    =====================================================
    Global variables
    ================
    """
    controller.clear_docs()
    title = "<h1 style=text-align:center;display:block;font-size:4.5em;color:#08a2d2;font-weight:bold;margin-top:4%;padding-bottom:1%>GenProp</h1>"
    with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.orange)) as formatdoc:
        gr.Markdown(title)
        gr.Markdown("<p style=color:#08a2d2;font-size:1.5em;padding-bottom:2%;text-align:center>Par Hexamind</p>")
        gr.Markdown("")
        with gr.Row():
            # Empty side columns center the main (scale=10) column.
            with gr.Column():
                pass
            with gr.Column(scale=10):
                """
                =====================================================
                Input and style components
                ==========================
                """

                gr.Markdown("<p style=font-size:1em;>Vous êtes chargé de produire une proposition commerciale</p>")

                with gr.Accordion("Charger votre proposition", open=True) as input_acc:
                    input_files_comp = gr.File(file_count="multiple", file_types=[".docx"], label="Document")

                with gr.Accordion("Appliquer les styles", open=False) as style_acc:
                    templates_radio = gr.Radio(
                        label="Templates",
                        choices=config['templates'],
                        value=config['templates'][config['default_template_index']],
                    )
                    with gr.Row():
                        options_btn = gr.CheckboxGroup(choices=config['options'],
                                                       label="Options",
                                                       interactive=True)
                    with gr.Accordion("Mapper les styles de liste", open=False) \
                            as list_acc:
                        with gr.Column(scale=2):
                            # One hidden dropdown per possible style; revealed on demand.
                            list_style_comps = [gr.Dropdown(visible=False, interactive=True)
                                                for _ in range(config['max_styles'])]
                    with gr.Accordion("Mapper les autres styles non présents dans le template", open=False) \
                            as newstyles_acc:
                        with gr.Column(scale=2):
                            newstyle_comps = [gr.Dropdown(visible=False, interactive=True)
                                              for _ in range(config['max_styles'])]

                    log_comp = gr.Textbox(label="Journal des modifications", visible=False)

                    output_styles_files_comp = gr.File(file_count="multiple", file_types=[".docx"], visible=False)

                    with gr.Row():
                        run_style_btn = gr.Button("Appliquer le template et les modifications de style", visible=False)
                        clear_style_btn = gr.Button("Annuler les modifications de style", visible=False)

                """
                ===============================================
                Generation components
                ======================
                """
                with gr.Accordion("Compléter automatiquement la proposition", open=False) as gen_acc:

                    generate_option_btn = gr.Radio(
                        label="Automatically generate a draft based on your own database",
                        choices=["Auto generation", "No generation"],
                        value="No generation",
                        interactive=True,
                        visible=False,
                    )

                    db_list_comp = gr.CheckboxGroup(
                        label="Base de connaissance",
                        # NOTE(review): trailing "F" in "automatiqueF" looks like a typo
                        # in the user-facing text — fix in a behavior change, not here.
                        info="Ces documents constituent la source de référence. Désélectionner pour qu'ils ne soient "
                             "pas pris en compte lors de la génération automatiqueF",
                        visible=True,
                        interactive=True,
                    )
                    # NOTE(review): stray trailing backslash (line continuation into a
                    # blank line) — harmless but should be removed.
                    db_reset_btn = gr.Button("Effacer la base de connaissance", visible=False, size="sm") \

                    with gr.Column(visible=True):
                        gr.Markdown("<p style=font-size:1em;text-align:center;>A des fins de démonstrations, la base de connaissance est alimentée depuis Wikipedia</p>")
                        wiki_fetch_btn = gr.Button("Rechercher les pages Wikipedia", visible=True, size="sm")
                        wiki_list_comp = gr.CheckboxGroup(
                            label="Sélectionner les pages à ajouter dans la base de connaissance",
                            visible=False,
                            interactive=True,
                        )

                        with gr.Column():
                            wiki_add_to_db_btn = \
                                gr.Button("Ajouter les documents sélectionnés à la base de connaissance",
                                          visible=False, size="sm")

                        # wiki_clear_btn = gr.Button("Effacer les choix de documents", visible=False, size="sm") \

                        # with gr.Tab("Depuis le disque local (en cours de développement)"):
                        #     my_files_list_comp = gr.Files(
                        #         label="Charger ses documents",
                        #         visible=True,
                        #     )
                        #     my_files_add_to_db_btn = gr.Button("Add files to sources", visible=False, size="sm")

                    add_close_btn = gr.Button("Close", visible=False, size="sm")
                    with gr.Row():
                        # NOTE(review): another stray trailing backslash.
                        db_add_doc_btn = gr.Button("Ajouter de nouveaux documents", visible=False, size="sm")\

                    output_files_comp = gr.Files(file_count="multiple", visible=False)

                    generate_btn = gr.Button("Générer", interactive=True)

                    clear_btn = gr.Button('Nettoyer', visible=False)
                    rerun_btn = gr.Button('Relancer', visible=False)


                """
                ===============================================
                Verification requirements components
                ======================
                """

                with gr.Accordion("Générer la réponse aux exigences (en cours de développement)", open=False, visible=True) as exigences_acc:
                    input_csv_comp = gr.File(file_count="single", file_types=[".csv", "xlsx"], visible=True, label="Fichiers d'exigences (csv, xlsx only)")
                    with gr.Row():
                        verif_btn = gr.Button("Générer la réponse aux exigences (en cours de développement)", visible=False)
                    output_csv_comp = gr.File(file_count="single", file_types=[".csv", "xlsx"], visible=False)

                gr.Markdown("")
                gr.Markdown("")
                gr.Markdown("<p style=font-size:1em;>Vous êtes administrateur de GenProp</p>")

                with gr.Accordion("Gérer les templates", open=False) as gestions_templates_acc:
                    templates_radio_modif = gr.Radio(
                        interactive=True,
                        label="Templates",
                        choices=config['templates'],
                        value=config['templates'][config['default_template_index']],
                    )
                    with gr.Row():
                        add_template_btn = gr.UploadButton("Ajouter un template",file_count="single", file_types=[".docx"])
                        delete_curr_template_btn = gr.Button("Supprimer le template sélectionné")
                with gr.Accordion("Gérer la base de connaissances (en cours de développement)", open=False):
                    pass

            with gr.Column():
                pass

        """
        ===================================================
        state variables
        ===============
        """
        wiki_source_var: [str] = gr.State([])  # list of wikipage titles of interest for the input text tasks
        wiki_db_var: [str] = gr.State([])  # list of wiki document titles in the db (as seen from the UI)
        my_files_db_var: [str] = gr.State([])  # list of titles of the files uploaded in the db (as seen from the UI)
        db_collection_var: str = gr.State("-1")  # name of the collection of documents sources in the db

        """
        ===================================================
        Input and styles functions and listeners
        ========================================
        """

        def input_csv_fn(input_csv_):
            # Reject anything that is not a csv/xlsx file; otherwise hand the
            # file to the controller and reveal the "generate answers" button.
            if not input_csv_.name.endswith('.csv') and not input_csv_.name.endswith('.xlsx'):
                raise gr.Error(f'File {input_csv_.name} is not a csv or xlsx file, please upload only csv or xlsx files')
            else:
                controller.set_input_csv(input_csv_)
            update_ = {
                verif_btn: gr.update(visible=True),
            }
            return update_


        input_csv_comp.upload(input_csv_fn,
                              inputs=[input_csv_comp],
                              outputs=[verif_btn],
                              )

        def input_files_upload_fn(input_files_):
            # Validate every uploaded file is a .docx before ingesting any.
            for files in input_files_:
                if(not files.name.endswith('.docx')):
                    raise gr.Error(f'File {files.name} is not a docx file, please upload only docx files')
                else:
                    continue
            controller.clear_docs()
            controller.copy_docs(input_files_)
            update_ = {
                newstyles_acc: gr.update(open=True),
                style_acc: gr.update(visible=True),
                run_style_btn: gr.update(visible=True),
                clear_style_btn: gr.update(visible=True),
                list_acc: gr.update(open=True),
            }
            newstyles_update = newstyles_fn()
            # misapplied_styles = misapplied_styles_fn()
            # for val in misapplied_styles.values():
            #     if val > 0:
            #         doc = list(misapplied_styles.keys())[list(misapplied_styles.values()).index(val)]
            #         gr.Warning(f"{val} paragraphs were detected in the document {doc.name} because their styles are not well applied. Please review your document for better results.")
            update_.update(newstyles_update)
            return update_

        input_files_comp.upload(input_files_upload_fn,
                                inputs=[input_files_comp],
                                outputs=[style_acc, newstyles_acc, run_style_btn, clear_style_btn, list_acc] + newstyle_comps + list_style_comps
                                )

        def input_file_clear_fn():
            # Reset the whole styles/generation UI when the inputs are cleared.
            controller.clear_docs()
            update_ = {
                options_btn: gr.update(value=[]),
                log_comp: gr.update(value="", visible=False),
                output_styles_files_comp: gr.update(value=[], visible=False),
                newstyles_acc: gr.update(open=False),
                style_acc: gr.update(open=False),
                gen_acc: gr.update(open=False),
                output_files_comp: gr.update(visible=False),
                run_style_btn: gr.update(visible=False),
                clear_style_btn: gr.update(visible=False),
                list_acc: gr.update(open=False),
                exigences_acc: gr.update(value=""),
            }
            newstyles_update_ = newstyles_reset()
            list_style_update_ = newliststyle_reset()
            update_.update(newstyles_update_)
            update_.update(list_style_update_)
            return update_

        input_files_comp.clear(
            input_file_clear_fn,
            inputs=[],
            outputs=[options_btn, output_styles_files_comp, output_files_comp, log_comp, newstyles_acc, list_acc,
                     gen_acc, style_acc, run_style_btn, clear_style_btn, exigences_acc] + newstyle_comps + list_style_comps
        )

        def misapplied_styles_fn():
            # Ask the controller how many paragraphs have misapplied styles.
            res = controller.retrieve_number_of_misapplied_styles()
            return res

        def newstyles_fn():
            # Recompute the style-mapping dropdowns: one set for list styles,
            # one for other styles absent from the selected template.
            update_ = {}
            update_.update(newliststyle_reset())
            update_.update(newstyles_reset())
            different_styles, all_template_styles = controller.get_difference_with_template()
            all_template_styles_names = [style.name for style in all_template_styles]
            list_styles_to_update = controller.get_list_styles()
            get_label_list = lambda i: f"style: {list_styles_to_update[i]['list_style']}"
            list_style_update_ = {
                list_style_comps[i]: gr.update(visible=i < len(list_styles_to_update),
                                               choices=sorted(all_template_styles_names, key=lambda x: distance(x, list_styles_to_update[i]['list_style'])),
                                               value=None,
                                               label=get_label_list(i)) if i < len(list_styles_to_update) else ''
                for i in range(config['max_styles'])
            }
            update_.update(list_style_update_)
            # delete styles in different_styles that are already in list_styles_to_update
            different_styles = delete_duplicate_styles(list_styles_to_update, different_styles)
            adapted_template_styles = []
            for i in range(len(different_styles)):
                adapted_template_styles.append([style.name for style in all_template_styles if style.type == different_styles[i]['style'].type])
            get_label = lambda i: f"style: {different_styles[i]['style'].name}"
            newstyles_update_ = {
                newstyle_comps[i]: gr.update(visible=i < len(different_styles),
                                             # sort the styles using levenstein distance function
                                             choices=sorted(adapted_template_styles[i], key=lambda x: distance(x, different_styles[i]['style'].name)),
                                             value=None,
                                             label=get_label(i)) if i < len(different_styles) else ''
                for i in range(len(different_styles))
            }
            update_.update(newstyles_update_)
            return update_

        def newliststyle_reset():
            # Hide and empty every list-style dropdown.
            update_ = {
                list_style_comps[i]: gr.update(visible=False,
                                               choices=[],
                                               value=None,
                                               label='')
                for i in range(config['max_styles'])
            }
            return update_

        def newstyles_reset():
            # Hide and empty every new-style dropdown.
            update_ = {
                newstyle_comps[i]: gr.update(visible=False,
                                             choices=[],
                                             value=None,
                                             label='')
                for i in range(config['max_styles'])
            }
            return update_

        def templates_fn(templates_):
            # Switch the active template, then refresh the mapping dropdowns.
            controller.set_template(templates_)
            update_ = newstyles_fn()
            return update_

        templates_radio.change(templates_fn,
                               inputs=[templates_radio],
                               outputs=[newstyles_acc, list_acc] + newstyle_comps + list_style_comps)

        def newstyle_fns(src_index: int):
            # Factory binding the dropdown index into its change callback.
            def newstyle_fn(newstyle_):
                controller.update_style(src_index, newstyle_)
            return newstyle_fn

        def change_list_style_fn(src_index: int):
            # Factory binding the dropdown index into its change callback.
            # NOTE(review): the inner function shadows the outer name — works,
            # but renaming one of them would be clearer.
            def change_list_style_fn(list_style_):
                controller.update_list_style(src_index, list_style_)
            return change_list_style_fn

        def add_template_fn(template):
            # Register a new template and refresh both template selectors
            # from the templates directory.
            controller.add_template(template)
            update_ = {
                templates_radio: gr.update(choices=[t for t in os.listdir(config['templates_path']) if t.endswith((".docx"))]),
                templates_radio_modif: gr.update(choices=[t for t in os.listdir(config['templates_path']) if t.endswith((".docx"))]),
            }
            return update_

        def delete_curr_template_fn(template):
            # Delete the selected template and reset the dependent UI.
            controller.delete_curr_template(template)
            update_ = {
                templates_radio: gr.update(choices=[t for t in os.listdir(config['templates_path']) if t.endswith((".docx"))]),
                templates_radio_modif: gr.update(choices=[t for t in os.listdir(config['templates_path']) if t.endswith((".docx"))]),
                options_btn: gr.update(value=[]),
                log_comp: gr.update(value="", visible=False),
                output_styles_files_comp: gr.update(value=[], visible=False),
                newstyles_acc: gr.update(open=False),
                run_style_btn: gr.update(visible=True),
                list_acc: gr.update(open=False),
            }
            return update_

        add_template_btn.upload(add_template_fn,
                                inputs=[add_template_btn],
                                outputs=[templates_radio,templates_radio_modif])

        delete_curr_template_btn.click(delete_curr_template_fn,
                                       inputs=[templates_radio],
                                       outputs=[templates_radio, options_btn, log_comp, output_styles_files_comp, newstyles_acc, run_style_btn, list_acc, templates_radio_modif])

        # Wire each mapping dropdown to its index-bound callback.
        for src_index, newstyle_comp in enumerate(newstyle_comps):
            newstyle_comp.input(newstyle_fns(src_index), inputs=[newstyle_comp], outputs=[],show_progress="full")

        for src_index, list_style_comp in enumerate(list_style_comps):
            list_style_comp.input(change_list_style_fn(src_index), inputs=[list_style_comp], outputs=[],show_progress="full")

        def clear_style_fn(input_files_):
            # Undo style changes: re-copy the source docs and restore the
            # default template.
            controller.clear_docs()
            if input_files_:
                controller.copy_docs(input_files_)
            controller.set_template()
            update_ = {
                options_btn: gr.update(value=[]),
                log_comp: gr.update(value="", visible=False),
                output_styles_files_comp: gr.update(value=[], visible=False),
                newstyles_acc: gr.update(open=False),
                run_style_btn: gr.update(visible=True),
                list_acc: gr.update(open=False),
                templates_radio: gr.update(value=config['templates'][config['default_template_index']]),
            }
            newstyles_update_ = newstyles_fn()
            update_.update(newstyles_update_)
            return update_

        clear_style_btn.click(clear_style_fn,
                              inputs=[input_files_comp],
                              outputs=[options_btn, output_styles_files_comp, log_comp, newstyles_acc, list_acc, run_style_btn, templates_radio]
                              + newstyle_comps + list_style_comps
                              )

        def run_style_fn(options_btn_):
            # Apply the template with the ticked options, then expose the log
            # and the generated files.
            print(f"options activated : {options_btn_}")
            controller.apply_template(options_btn_)
            log = controller.get_log()
            new_docs_path = controller.generated_docs_path
            output_paths = [f"{new_docs_path}/{f}" for f in os.listdir(new_docs_path)]
            print(f"output_paths: {output_paths}")
            update_ = {
                log_comp: gr.update(value=log, visible=True),
                output_styles_files_comp: gr.update(value=output_paths, visible=True),
                run_style_btn: gr.update(visible=False),
            }
            return update_


        run_style_btn.click(run_style_fn,
                            inputs=[options_btn],
                            outputs=[log_comp, output_styles_files_comp, run_style_btn] + newstyle_comps, show_progress="full")

        """
        =====================================================
        Generation functions
        ====================
        """

        def generate_option_fn(db_collection_):
            # Resolve (or lazily create) the db collection backing generation.
            id_ = controller.get_or_create_collection(db_collection_)
            update_ = {
                db_collection_var: id_,
            }
            return update_

        def wiki_fetch1_fn():
            """
            fetch the wikifiles interesting for solving the tasks as defined in the input doc
            (first step: just reveal the checkbox list)
            """
            update_ = {
                wiki_list_comp: gr.update(visible=True),
            }
            return update_

        async def wiki_fetch2_fn():
            """
            fetch the wikifiles interesting for solving the tasks as defined in the input doc
            """
            wiki_interesting_files = await controller.wiki_fetch()
            print(f"wiki_interesting_files: {wiki_interesting_files}")
            wiki_files = wiki_interesting_files  # [w for w in wiki_interesting_files if w not in wiki_db_files_]
            update_ = {
                wiki_list_comp: gr.update(visible=True, value=[], choices=wiki_files),
                wiki_source_var: wiki_interesting_files,
                wiki_add_to_db_btn: gr.update(visible=True),
                # wiki_clear_btn: gr.update(visible=True), #Button to clear the choices that are by default all ticked
            }
            return update_

        async def wiki_add_to_db_fn(wiki_list_, wiki_source_, wiki_db_, db_list_, db_collection_):
            """
            adds the wikipages to the db source
            """
            wiki_to_add = [wiki for wiki in wiki_list_ if wiki not in wiki_db_]
            db_list_ += wiki_to_add
            wiki_db_ += wiki_to_add
            wiki_source_remaining = [wiki for wiki in wiki_source_ if wiki not in wiki_db_]
            # Upload/store all selected pages concurrently.
            async_upload_and_store_tasks = [asyncio.create_task(controller.wiki_upload_and_store(wiki, db_collection_)) for wiki in wiki_to_add]  # A DEPLACER DANS LE CONTROLLER
            await asyncio.gather(*async_upload_and_store_tasks)
            db_not_empty = 0 < len(db_list_)
            wiki_to_add_not_empty = 0 < len(wiki_source_remaining)
            update_ = {
                wiki_db_var: wiki_db_,
                wiki_list_comp: gr.update(value=[], choices=wiki_source_remaining),
                wiki_add_to_db_btn: gr.update(visible=wiki_to_add_not_empty),
                db_list_comp: gr.update(
                    visible=True,
                    value=db_list_,
                    choices=db_list_,
                    label="Database content"),
                db_reset_btn: gr.update(visible=db_not_empty),
                generate_btn: gr.update(visible=True, interactive=db_not_empty),
            }
            return update_

        def generate_fn1():
            # First step of generation: reveal the output-files area.
            update_ = {
                output_files_comp: gr.update(visible=True)
            }
            return update_

        async def generate_fn2(db_collection_, db_list_):
            # Second step: run the actual document generation from the db.
            output_files = await controller.generate_doc_from_db(collection_name=db_collection_,
                                                                 from_files=db_list_)
            update_ = {
                output_files_comp: gr.update(value=output_files, visible=True),
            }
            return update_


        """
        =====================================================
        Generation listeners
        ====================
        """

        wiki_fetch_btn \
            .click(wiki_fetch1_fn, inputs=[], outputs=[wiki_list_comp]) \
            .then(wiki_fetch2_fn,
                  inputs=[],
                  outputs=[wiki_list_comp, wiki_source_var, wiki_add_to_db_btn])

        wiki_add_to_db_btn\
            .click(generate_option_fn,
                   inputs=[db_collection_var],
                   outputs=[db_collection_var])\
            .then(wiki_add_to_db_fn,
                  inputs=[wiki_list_comp, wiki_source_var, wiki_db_var, db_list_comp, db_collection_var],
                  outputs=[db_list_comp, wiki_list_comp, wiki_db_var,
                           generate_btn, wiki_add_to_db_btn, db_reset_btn])

        generate_btn\
            .click(generate_fn1,
                   inputs=[],
                   outputs=[output_files_comp])\
            .then(generate_fn2,
                  inputs=[db_collection_var, db_list_comp],
                  outputs=[output_files_comp])


        """
        =====================================================
        Clear and rerun functions and listeners
        =======================================
        """

        def clear_fn():
            # Reset the input/output file areas and hide the action buttons.
            update_ = {
                input_files_comp: gr.update(value=None),
                output_files_comp: gr.update(value=None, visible=False),
                clear_btn: gr.update(visible=False),
                rerun_btn: gr.update(visible=False),
            }
            return update_

        clear_btn.click(clear_fn,
                        inputs=[],
                        outputs=[input_files_comp, output_files_comp, clear_btn, rerun_btn])

        # wiki_clear_btn.click(clear_choices_fn, inputs=[], outputs=[wiki_list_comp]) #listener for the clear button of the wiki choices
    return formatdoc
|
temp/generated_files/file.txt
ADDED
File without changes
|