FatimaGr committed on
Commit
73fbe78
·
verified ·
1 Parent(s): 79590c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -140
app.py CHANGED
@@ -4,57 +4,7 @@ from fastapi.staticfiles import StaticFiles
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer, MarianMTModel, MarianTokenizer
6
  import shutil
7
- #
8
- import os
9
- import logging
10
- from PyPDF2 import PdfReader
11
- import docx
12
- from PIL import Image
13
- import openpyxl # 📌 Pour lire les fichiers Excel (.xlsx)
14
- from pptx import Presentation
15
- import fitz # PyMuPDF
16
- import io
17
- from docx import Document
18
- import matplotlib.pyplot as plt
19
- import seaborn as sns
20
- import torch
21
- import re
22
- import pandas as pd
23
- from transformers import AutoTokenizer, AutoModelForCausalLM
24
- from fastapi.responses import FileResponse
25
- import os
26
- from fastapi.middleware.cors import CORSMiddleware
27
- import matplotlib
28
- matplotlib.use('Agg')
29
 
30
- import re
31
- import torch
32
- import pandas as pd
33
- import matplotlib.pyplot as plt
34
- import seaborn as sns
35
- from transformers import AutoTokenizer, AutoModelForCausalLM
36
- from fastapi import FastAPI, File, UploadFile, Form
37
- from fastapi.responses import FileResponse
38
- import os
39
- from fastapi.middleware.cors import CORSMiddleware
40
- from fastapi import FastAPI, File, UploadFile, Form
41
- from fastapi.responses import JSONResponse, RedirectResponse
42
- from fastapi.staticfiles import StaticFiles
43
- from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
44
- import shutil
45
- import os
46
- import logging
47
- from fastapi.middleware.cors import CORSMiddleware
48
- from PyPDF2 import PdfReader
49
- import docx
50
- from PIL import Image # Pour ouvrir les images avant analyse
51
- from transformers import MarianMTModel, MarianTokenizer
52
- import os
53
- import fitz
54
- from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
55
-
56
- import logging
57
- import openpyxl
58
 
59
 
60
  # Configuration du logging
@@ -62,96 +12,6 @@ logging.basicConfig(level=logging.INFO)
62
 
63
  app = FastAPI()
64
 
65
- # Configuration CORS
66
- app.add_middleware(
67
- CORSMiddleware,
68
- allow_origins=["*"],
69
- allow_credentials=True,
70
- allow_methods=["*"],
71
- allow_headers=["*"],
72
- )
73
-
74
- UPLOAD_DIR = "uploads"
75
- os.makedirs(UPLOAD_DIR, exist_ok=True)
76
-
77
- #traduction-----------------------------------------------------------------------------------------------------------
78
-
79
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
80
- model_name = "facebook/m2m100_418M"
81
- tokenizer = AutoTokenizer.from_pretrained(model_name)
82
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
83
-
84
-
85
- # Fonction pour extraire le texte
86
- def extract_text_from_pdf(file):
87
- doc = fitz.open(stream=file.file.read(), filetype="pdf")
88
- return "\n".join([page.get_text() for page in doc]).strip()
89
-
90
- def extract_text_from_docx(file):
91
- doc = Document(io.BytesIO(file.file.read()))
92
- return "\n".join([para.text for para in doc.paragraphs]).strip()
93
-
94
- def extract_text_from_pptx(file):
95
- prs = Presentation(io.BytesIO(file.file.read()))
96
- return "\n".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")]).strip()
97
-
98
- def extract_text_from_excel(file):
99
- wb = openpyxl.load_workbook(io.BytesIO(file.file.read()), data_only=True)
100
- text = [str(cell) for sheet in wb.worksheets for row in sheet.iter_rows(values_only=True) for cell in row if cell]
101
- return "\n".join(text).strip()
102
-
103
- @app.post("/translate/")
104
- async def translate_document(file: UploadFile = File(...), target_lang: str = Form(...)):
105
- """API pour traduire un document."""
106
- try:
107
- logging.info(f"📥 Fichier reçu : {file.filename}")
108
- logging.info(f"🌍 Langue cible reçue : {target_lang}")
109
-
110
- if model is None or tokenizer is None:
111
- return JSONResponse(status_code=500, content={"error": "Modèle de traduction non chargé"})
112
-
113
- # Extraction du texte
114
- if file.filename.endswith(".pdf"):
115
- text = extract_text_from_pdf(file)
116
- elif file.filename.endswith(".docx"):
117
- text = extract_text_from_docx(file)
118
- elif file.filename.endswith(".pptx"):
119
- text = extract_text_from_pptx(file)
120
- elif file.filename.endswith(".xlsx"):
121
- text = extract_text_from_excel(file)
122
- else:
123
- return JSONResponse(status_code=400, content={"error": "Format non supporté"})
124
-
125
- logging.info(f"📜 Texte extrait : {text[:50]}...")
126
-
127
- if not text:
128
- return JSONResponse(status_code=400, content={"error": "Aucun texte trouvé dans le document"})
129
-
130
- # Vérifier si la langue cible est supportée
131
- target_lang_id = tokenizer.get_lang_id(target_lang)
132
-
133
- if target_lang_id is None:
134
- return JSONResponse(
135
- status_code=400,
136
- content={"error": f"Langue cible '{target_lang}' non supportée. Langues disponibles : {list(tokenizer.lang_code_to_id.keys())}"}
137
- )
138
-
139
- # Traduction
140
- tokenizer.src_lang = "fr"
141
- encoded_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
142
-
143
- logging.info(f"🔍 ID de la langue cible : {target_lang_id}")
144
-
145
- generated_tokens = model.generate(**encoded_text, forced_bos_token_id=target_lang_id)
146
-
147
- translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
148
-
149
- logging.info(f"✅ Traduction réussie : {translated_text[:50]}...")
150
- return {"translated_text": translated_text}
151
-
152
- except Exception as e:
153
- logging.error(f"❌ Erreur lors de la traduction : {e}")
154
- return JSONResponse(status_code=500, content={"error": "Échec de la traduction"})
155
 
156
  # Servir les fichiers statiques (HTML, CSS, JS)
157
  app.mount("/static", StaticFiles(directory="static", html=True), name="static")
 
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer, MarianMTModel, MarianTokenizer
6
  import shutil
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  # Configuration du logging
 
12
 
13
  app = FastAPI()
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # Servir les fichiers statiques (HTML, CSS, JS)
17
  app.mount("/static", StaticFiles(directory="static", html=True), name="static")