Space status: Runtime error

Commit ea2133c ("fallback")
Parent: 9e7dc23

Files changed:
- Dockerfile         +22 -0
- app.py             +38 -202
- requirements.txt   +5 -15
- text_humanizer.py  +188 -378
Dockerfile ADDED
@@ -0,0 +1,22 @@
+FROM python:3.10
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+
+# download spacy model and nltk resources at build time (app source is copied first so text_humanizer is importable)
+RUN python -m spacy download en_core_web_sm || true
+RUN python - <<'PY'
+from text_humanizer import download_nltk_resources
+download_nltk_resources()
+PY
+
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,209 +1,45 @@
-# For Hugging Face Spaces - this is the main app file
-import gradio as gr
-import time
-import os
...
-try:
-    humanizer = AITextHumanizer()
-    print("✅ Humanizer loaded successfully!")
-except Exception as e:
-    print(f"❌ Error loading humanizer: {e}")
-    humanizer = None
...
-    try:
...
-            intensity=intensity
-        )
-
-        processing_time = (time.time() - start_time) * 1000
-
-        changes_text = ", ".join(result["changes_made"]) if result["changes_made"] else "No significant changes made"
-
-        return (
-            result["humanized_text"],
-            f"**📊 Processing Results:**\n- **Similarity Score:** {result['similarity_score']:.3f}\n- **Processing Time:** {processing_time:.1f}ms\n- **Style:** {result['style'].title()}\n- **Intensity:** {result['intensity']}\n\n**🔄 Changes Made:** {changes_text}",
-            result["similarity_score"],
-            changes_text,
-            processing_time
-        )
-
-    except Exception as e:
-        return f"❌ Error processing text: {str(e)}", "", 0.0, "Processing error", 0.0
-
-with gr.Blocks(
-    theme=gr.themes.Soft(),
-    css="""
-    .main-header {
-        text-align: center;
-        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-        color: white;
-        padding: 20px;
-        border-radius: 10px;
-        margin-bottom: 20px;
-    }
-    .stats-box {
-        background: #f8f9fa;
-        padding: 15px;
-        border-radius: 8px;
-        border-left: 4px solid #667eea;
-    }
-    """
-) as iface:
-
-    gr.HTML("""
-    <div class="main-header">
-        <h1>🤖➡️👤 AI Text Humanizer</h1>
-        <p>Transform AI-generated text to sound more natural and human-like</p>
-        <p><em>Powered by advanced NLP techniques and transformers</em></p>
-    </div>
-    """)
-
-    with gr.Tab("🎯 Humanize Text"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                gr.HTML("<h3>📝 Input</h3>")
-
-                input_text = gr.Textbox(
-                    label="Text to Humanize",
-                    placeholder="Paste your AI-generated text here...\n\nExample: Furthermore, it is important to note that artificial intelligence systems demonstrate significant capabilities...",
-                    lines=10,
-                    max_lines=20
-                )
-
-                with gr.Row():
-                    style_dropdown = gr.Dropdown(
-                        choices=["Natural", "Casual", "Conversational"],
-                        value="Natural",
-                        label="🎨 Humanization Style"
-                    )
-
-                    intensity_slider = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.7,
-                        step=0.1,
-                        label="⚡ Intensity Level"
-                    )
-
-                humanize_btn = gr.Button(
-                    "🚀 Humanize Text",
-                    variant="primary",
-                    size="lg"
-                )
-
-            with gr.Column(scale=1):
-                gr.HTML("<h3>✨ Output</h3>")
-
-                output_text = gr.Textbox(
-                    label="Humanized Text",
-                    lines=10,
-                    max_lines=20,
-                    show_copy_button=True
-                )
-
-                stats_output = gr.Markdown(
-                    label="📊 Processing Statistics",
-                    value="Results will appear here after processing..."
-                )
-
-    with gr.Tab("📊 Examples & Guide"):
-        gr.HTML("<h3>💡 Try These Examples</h3>")
-
-        # Examples
-        examples = gr.Examples(
-            examples=[
-                [
-                    "Furthermore, it is important to note that artificial intelligence systems demonstrate significant capabilities in natural language processing tasks. Subsequently, these systems can analyze and generate text with remarkable accuracy. Nevertheless, it is crucial to understand that human oversight remains essential for optimal performance.",
-                    "Conversational",
-                    0.8
-                ],
-                [
-                    "The implementation of this comprehensive solution will facilitate the optimization of business processes and operational workflows. Moreover, it will demonstrate substantial improvements in efficiency metrics while maintaining quality standards.",
-                    "Natural",
-                    0.6
-                ],
-                [
-                    "In conclusion, the systematic analysis reveals that the proposed methodology demonstrates significant potential for enhancing performance indicators. Additionally, the structured approach ensures optimal resource utilization.",
-                    "Casual",
-                    0.7
-                ]
-            ],
-            inputs=[input_text, style_dropdown, intensity_slider],
-            outputs=[output_text, stats_output],
-            fn=humanize_text_hf,
-            cache_examples=False
-        )
-
-        gr.HTML("""
-        <div style="margin-top: 30px;">
-            <h3>🎯 How It Works</h3>
-            <div class="stats-box">
-                <h4>🔧 Transformation Techniques:</h4>
-                <ul>
-                    <li><strong>Smart Word Replacement:</strong> formal words → casual alternatives</li>
-                    <li><strong>Contraction Addition:</strong> "do not" → "don't", "it is" → "it's"</li>
-                    <li><strong>AI Transition Removal:</strong> removes robotic transition phrases</li>
-                    <li><strong>Sentence Restructuring:</strong> varies length and structure</li>
-                    <li><strong>Natural Imperfections:</strong> adds human-like variations</li>
-                    <li><strong>Context-Aware Paraphrasing:</strong> maintains meaning while improving flow</li>
-                </ul>
-            </div>
-
-            <div class="stats-box" style="margin-top: 15px;">
-                <h4>🎨 Style Guide:</h4>
-                <ul>
-                    <li><strong>Natural (0.5-0.7):</strong> Professional content with human touch</li>
-                    <li><strong>Casual (0.6-0.8):</strong> Blog posts, articles, informal content</li>
-                    <li><strong>Conversational (0.7-1.0):</strong> Social media, very informal text</li>
-                </ul>
-            </div>
-
-            <div class="stats-box" style="margin-top: 15px;">
-                <h4>⚡ Performance:</h4>
-                <ul>
-                    <li><strong>Similarity Preservation:</strong> Maintains 85-95% semantic similarity</li>
-                    <li><strong>Processing Speed:</strong> ~500ms for typical paragraphs</li>
-                    <li><strong>Quality:</strong> Advanced NLP models ensure high-quality output</li>
-                </ul>
-            </div>
-        </div>
-        """)
-
-    # Event handlers
-    humanize_btn.click(
-        fn=humanize_text_hf,
-        inputs=[input_text, style_dropdown, intensity_slider],
-        outputs=[output_text, stats_output]
-    )
-
-iface.launch(
-    share=False,  # HF Spaces handles sharing
-    server_name="0.0.0.0",
-    server_port=7860,
-    show_error=True
-)
+import os
+from fastapi import FastAPI, Header, HTTPException, Depends
+from pydantic import BaseModel
+from text_humanizer import TextHumanizer, download_nltk_resources
+import spacy
+
+API_KEY = os.environ.get("API_KEY", "dev-key")
+PORT = int(os.environ.get("PORT", 7860))
+
+app = FastAPI()
+humanizer = None
+
+class HumanizeReq(BaseModel):
+    text: str
+    use_passive: bool = False
+    use_synonyms: bool = False
+
+def verify_key(x_api_key: str = Header(None)):
+    if x_api_key != API_KEY:
+        raise HTTPException(status_code=403, detail="Forbidden")
+    return True
+
+@app.get("/")
+def greet_json():
+    return {"Hello": "World!"}
+
+@app.on_event("startup")
+def startup():
+    # ensure NLTK resources and spacy model are available at runtime
+    download_nltk_resources()
+    try:
+        spacy.load("en_core_web_sm")
+    except OSError:
+        import spacy.cli
+        spacy.cli.download("en_core_web_sm")
+    global humanizer
+    humanizer = TextHumanizer()
+
+@app.post("/humanize")
+def humanize(req: HumanizeReq, _=Depends(verify_key)):
+    return {"humanized": humanizer.humanize_text(req.text, req.use_passive, req.use_synonyms)}
+
+# if __name__ == "__main__":
+#     import uvicorn
+#     uvicorn.run(app, host="0.0.0.0", port=PORT)
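For reference, a minimal client sketch for the new /humanize endpoint, assuming the service is reachable at http://localhost:7860 and API_KEY was left at its "dev-key" default (both values come from the diff above). It uses only the standard library, since requests is no longer pinned in requirements.txt; the sample text is illustrative.

import json
import urllib.request

# FastAPI maps the x_api_key Header parameter to the "x-api-key" HTTP header.
req = urllib.request.Request(
    "http://localhost:7860/humanize",
    data=json.dumps({"text": "We can't confirm it's done.", "use_synonyms": True}).encode(),
    headers={"Content-Type": "application/json", "x-api-key": "dev-key"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))  # {"humanized": "..."}

A request without the header, or with the wrong key, gets a 403 from verify_key.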
requirements.txt CHANGED
@@ -1,15 +1,5 @@
-fastapi
-uvicorn[standard]
-
-
-
-sentence-transformers==2.2.2
-nltk==3.8.1
-spacy>=3.7.0
-pydantic==2.5.0
-numpy==1.25.2
-pandas==2.1.3
-redis==5.0.1
-python-multipart==0.0.6
-aiofiles==23.2.1
-requests==2.31.0
+fastapi
+uvicorn[standard]
+spacy
+nltk
+sentence-transformers
text_humanizer.py CHANGED
@@ -1,390 +1,200 @@
-import random
-from typing import List, Dict, Optional
-from sentence_transformers import SentenceTransformer
-import numpy as np
-from transformers import pipeline
...
-try:
-    nltk.data.find('corpora/wordnet')
-except LookupError:
-    nltk.download('wordnet')
-
-try:
-    nltk.data.find('corpora/omw-1.4')
-except LookupError:
-    nltk.download('omw-1.4')
...
-class AITextHumanizer:
-    def __init__(self):
-        """Initialize the text humanizer with necessary models and data"""
-        print("Loading models...")
-
-        # Load sentence transformer for semantic similarity
-        try:
...
-        except Exception as e:
...
-            "establish": "set up",
-            "magnitude": "size",
-            "comprehensive": "complete",
-            "significant": "big",
-            "substantial": "large",
-            "optimal": "best",
-            "sufficient": "enough",
-            "prior to": "before",
-            "in order to": "to",
-            "due to the fact that": "because",
-            "at this point in time": "now",
-            "in the event that": "if",
-        }
-
-        # Contractions mapping
-        self.contractions = {
-            "do not": "don't",
-            "does not": "doesn't",
-            "did not": "didn't",
-            "will not": "won't",
-            "would not": "wouldn't",
-            "should not": "shouldn't",
-            "could not": "couldn't",
-            "cannot": "can't",
-            "is not": "isn't",
-            "are not": "aren't",
-            "was not": "wasn't",
-            "were not": "weren't",
-            "have not": "haven't",
-            "has not": "hasn't",
-            "had not": "hadn't",
-            "I am": "I'm",
-            "you are": "you're",
-            "he is": "he's",
-            "she is": "she's",
-            "it is": "it's",
-            "we are": "we're",
-            "they are": "they're",
-            "I have": "I've",
-            "you have": "you've",
-            "we have": "we've",
-            "they have": "they've",
-            "I will": "I'll",
-            "you will": "you'll",
-            "he will": "he'll",
-            "she will": "she'll",
-            "it will": "it'll",
-            "we will": "we'll",
-            "they will": "they'll",
-        }
-
-        # Transition words that make text sound more AI-like
-        self.ai_transition_words = [
-            "Furthermore,", "Moreover,", "Additionally,", "Subsequently,",
-            "Consequently,", "Therefore,", "Nevertheless,", "However,",
-            "In conclusion,", "To summarize,", "In summary,", "Overall,",
-            "It is important to note that", "It should be emphasized that",
-            "It is worth mentioning that", "It is crucial to understand that"
-        ]
-
-        # Natural alternatives
-        self.natural_transitions = [
-            "Also,", "Plus,", "And,", "Then,", "So,", "But,", "Still,",
-            "Anyway,", "By the way,", "Actually,", "Basically,",
-            "Look,", "Listen,", "Here's the thing:", "The point is,",
-            "What's more,", "On top of that,", "Another thing,",
-        ]
...
-            if len(words) > 1 and words[0].lower() in ['and', 'but', 'or', 'so']:
-                sentence = words[0].lower() + ' ' + ' '.join(words[1:])
-
-            # Sometimes use informal punctuation
-            if random.random() < imperfection_rate:
-                if sentence.endswith('.'):
-                    sentence = sentence[:-1]  # Remove period occasionally
-                elif not sentence.endswith(('.', '!', '?')):
-                    if random.random() < 0.5:
-                        sentence += '.'
-
-            modified_sentences.append(sentence)
-
-        return ' '.join(modified_sentences)
-
-    def paraphrase_segments(self, text: str, paraphrase_rate: float = 0.3) -> str:
-        """Paraphrase some segments using the transformer model"""
-        if not self.paraphraser:
-            return text
-
-        sentences = sent_tokenize(text)
-        paraphrased_sentences = []
-
-        for sentence in sentences:
-            if random.random() < paraphrase_rate and len(sentence.split()) > 5:
-                try:
-                    # Create paraphrase prompt
-                    prompt = f"Rewrite this sentence in a more natural, conversational way: {sentence}"
-
-                    result = self.paraphraser(prompt, max_length=100, num_return_sequences=1)
-                    paraphrased = result[0]['generated_text']
-
-                    # Clean up the result
-                    paraphrased = paraphrased.replace(prompt, '').strip()
-                    if paraphrased and len(paraphrased) > 10:
-                        paraphrased_sentences.append(paraphrased)
-                    else:
-                        paraphrased_sentences.append(sentence)
-                except Exception:
-                    paraphrased_sentences.append(sentence)
-            else:
-                paraphrased_sentences.append(sentence)
-
-        return ' '.join(paraphrased_sentences)
-
-    def calculate_similarity(self, text1: str, text2: str) -> float:
-        """Calculate semantic similarity between original and humanized text"""
-        if not self.similarity_model:
-            return 0.85  # Return reasonable default if model not available
-
-        try:
-            embeddings1 = self.similarity_model.encode([text1])
-            embeddings2 = self.similarity_model.encode([text2])
-            similarity = np.dot(embeddings1[0], embeddings2[0]) / (
-                np.linalg.norm(embeddings1[0]) * np.linalg.norm(embeddings2[0])
-            )
-            return float(similarity)
-        except Exception as e:
-            print(f"Similarity calculation failed: {e}")
-            return 0.85
-
-    def humanize_text(self,
-                      text: str,
-                      style: str = "natural",
-                      intensity: float = 0.7) -> Dict:
-        """
-        Main humanization function
-
-        Args:
-            text: Input text to humanize
-            style: Style of humanization ('natural', 'casual', 'conversational')
-            intensity: Intensity of humanization (0.0 to 1.0)
-
-        Returns:
-            Dictionary with humanized text and metadata
-        """
-        if not text.strip():
-            return {
-                "original_text": text,
-                "humanized_text": text,
-                "similarity_score": 1.0,
-                "changes_made": []
-            }
-
-        changes_made = []
-        humanized_text = text
-
-        # Apply transformations based on intensity
-        if intensity > 0.2:
-            # Replace formal words
-            before_formal = humanized_text
-            humanized_text = self.replace_formal_words(humanized_text, intensity * 0.7)
-            if humanized_text != before_formal:
-                changes_made.append("Replaced formal words with casual alternatives")
-
-        if intensity > 0.3:
-            # Add contractions
-            before_contractions = humanized_text
-            humanized_text = self.add_contractions(humanized_text)
-            if humanized_text != before_contractions:
-                changes_made.append("Added contractions")
-
-        if intensity > 0.4:
-            # Replace AI-like transitions
-            before_transitions = humanized_text
-            humanized_text = self.replace_ai_transitions(humanized_text)
-            if humanized_text != before_transitions:
-                changes_made.append("Replaced AI-like transition words")
-
-        if intensity > 0.5:
-            # Vary sentence structure
-            before_structure = humanized_text
-            humanized_text = self.vary_sentence_structure(humanized_text)
-            if humanized_text != before_structure:
-                changes_made.append("Varied sentence structure")
-
-        if intensity > 0.6 and style in ["casual", "conversational"]:
-            # Add natural imperfections
-            before_imperfections = humanized_text
-            humanized_text = self.add_natural_imperfections(humanized_text, intensity * 0.2)
-            if humanized_text != before_imperfections:
-                changes_made.append("Added natural imperfections")
-
-        if intensity > 0.7:
-            # Paraphrase some segments
-            before_paraphrase = humanized_text
-            humanized_text = self.paraphrase_segments(humanized_text, intensity * 0.4)
-            if humanized_text != before_paraphrase:
-                changes_made.append("Paraphrased some segments")
-
-        # Calculate similarity
-        similarity_score = self.calculate_similarity(text, humanized_text)
-
-        return {
-            "original_text": text,
-            "humanized_text": humanized_text,
-            "similarity_score": similarity_score,
-            "changes_made": changes_made,
-            "style": style,
-            "intensity": intensity
-        }
...
+import ssl
+import random
+import warnings
+
+import nltk
+import spacy
+from nltk.tokenize import word_tokenize
+from nltk.corpus import wordnet
+from sentence_transformers import SentenceTransformer, util
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+NLP_GLOBAL = spacy.load("en_core_web_sm")
+
+def download_nltk_resources():
+    """
+    Download required NLTK resources if not already installed.
+    """
+    try:
+        _create_unverified_https_context = ssl._create_unverified_context
+    except AttributeError:
+        pass
+    else:
+        ssl._create_default_https_context = _create_unverified_https_context
+
+    resources = ['punkt', 'averaged_perceptron_tagger', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger_eng']
+    for resource in resources:
+        try:
+            nltk.download(resource, quiet=True)
+        except Exception as e:
+            print(f"Error downloading {resource}: {str(e)}")
+
+
+# This class contains methods to humanize academic text, such as improving readability or
+# simplifying complex language.
+class TextHumanizer:
+    """
+    Transforms text into a more formal (academic) style:
+    - Expands contractions
+    - Adds academic transitions
+    - Optionally converts some sentences to passive voice
+    - Optionally replaces words with synonyms for more formality
+    """
+
+    def __init__(
+        self,
+        model_name='paraphrase-MiniLM-L6-v2',
+        p_passive=0.2,
+        p_synonym_replacement=0.3,
+        p_academic_transition=0.3,
+        seed=None
+    ):
+        if seed is not None:
+            random.seed(seed)
+
+        self.nlp = spacy.load("en_core_web_sm")
+        self.model = SentenceTransformer(model_name)
+
+        # Transformation probabilities
+        self.p_passive = p_passive
+        self.p_synonym_replacement = p_synonym_replacement
+        self.p_academic_transition = p_academic_transition
+
+        # Common academic transitions
+        self.academic_transitions = [
+            "Moreover,", "Additionally,", "Furthermore,", "Hence,",
+            "Therefore,", "Consequently,", "Nonetheless,", "Nevertheless,"
+        ]
+
+    def humanize_text(self, text, use_passive=False, use_synonyms=False):
+        doc = self.nlp(text)
+        transformed_sentences = []
+
+        for sent in doc.sents:
+            sentence_str = sent.text.strip()
+
+            # 1. Expand contractions
+            sentence_str = self.expand_contractions(sentence_str)
+
+            # 2. Possibly add academic transitions
+            # if random.random() < self.p_academic_transition:
+            #     sentence_str = self.add_academic_transitions(sentence_str)
+
+            # 3. Optionally convert to passive
+            if use_passive and random.random() < self.p_passive:
+                sentence_str = self.convert_to_passive(sentence_str)
+
+            # 4. Optionally replace words with synonyms
+            if use_synonyms and random.random() < self.p_synonym_replacement:
+                sentence_str = self.replace_with_synonyms(sentence_str)
+
+            transformed_sentences.append(sentence_str)
+
+        return ' '.join(transformed_sentences)
+
+    def expand_contractions(self, sentence):
+        contraction_map = {
+            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
+            "'ve": " have", "'d": " would", "'m": " am"
+        }
+        tokens = word_tokenize(sentence)
+        expanded_tokens = []
+        for token in tokens:
+            lower_token = token.lower()
+            replaced = False
+            for contraction, expansion in contraction_map.items():
+                if contraction in lower_token and lower_token.endswith(contraction):
+                    new_token = lower_token.replace(contraction, expansion)
+                    if token[0].isupper():
+                        new_token = new_token.capitalize()
+                    expanded_tokens.append(new_token)
+                    replaced = True
+                    break
+            if not replaced:
+                expanded_tokens.append(token)
+
+        return ' '.join(expanded_tokens)
+
+    def add_academic_transitions(self, sentence):
+        transition = random.choice(self.academic_transitions)
+        return f"{transition} {sentence}"
+
+    def convert_to_passive(self, sentence):
+        doc = self.nlp(sentence)
+        subj_tokens = [t for t in doc if t.dep_ == 'nsubj' and t.head.dep_ == 'ROOT']
+        dobj_tokens = [t for t in doc if t.dep_ == 'dobj']
+
+        if subj_tokens and dobj_tokens:
+            subject = subj_tokens[0]
+            dobj = dobj_tokens[0]
+            verb = subject.head
+            if subject.i < verb.i < dobj.i:
+                passive_str = f"{dobj.text} {verb.lemma_} by {subject.text}"
+                original_str = ' '.join(token.text for token in doc)
+                chunk = f"{subject.text} {verb.text} {dobj.text}"
+                if chunk in original_str:
+                    sentence = original_str.replace(chunk, passive_str)
+        return sentence
+
+    def replace_with_synonyms(self, sentence):
+        tokens = word_tokenize(sentence)
+        pos_tags = nltk.pos_tag(tokens)
+
+        new_tokens = []
+        for (word, pos) in pos_tags:
+            if pos.startswith(('J', 'N', 'V', 'R')) and wordnet.synsets(word):
+                if random.random() < 0.5:
+                    synonyms = self._get_synonyms(word, pos)
+                    if synonyms:
+                        best_synonym = self._select_closest_synonym(word, synonyms)
+                        new_tokens.append(best_synonym if best_synonym else word)
+                    else:
+                        new_tokens.append(word)
+                else:
+                    new_tokens.append(word)
+            else:
+                new_tokens.append(word)
+
+        # Join cleanly with punctuation fix
+        sentence = " ".join(new_tokens)
+        sentence = (
+            sentence.replace(" ,", ",")
+            .replace(" .", ".")
+            .replace(" !", "!")
+            .replace(" ?", "?")
+            .replace(" :", ":")
+            .replace(" '", "'")
+        )
+        return sentence
+
+    def _get_synonyms(self, word, pos):
+        wn_pos = None
+        if pos.startswith('J'):
+            wn_pos = wordnet.ADJ
+        elif pos.startswith('N'):
+            wn_pos = wordnet.NOUN
+        elif pos.startswith('R'):
+            wn_pos = wordnet.ADV
+        elif pos.startswith('V'):
+            wn_pos = wordnet.VERB
+
+        synonyms = set()
+        for syn in wordnet.synsets(word, pos=wn_pos):
+            for lemma in syn.lemmas():
+                lemma_name = lemma.name().replace('_', ' ')
+                if lemma_name.lower() != word.lower():
+                    synonyms.add(lemma_name)
+        return list(synonyms)
+
+    def _select_closest_synonym(self, original_word, synonyms):
+        if not synonyms:
+            return None
+        original_emb = self.model.encode(original_word, convert_to_tensor=True)
+        synonym_embs = self.model.encode(synonyms, convert_to_tensor=True)
+        cos_scores = util.cos_sim(original_emb, synonym_embs)[0]
+        max_score_index = cos_scores.argmax().item()
+        max_score = cos_scores[max_score_index].item()
+        if max_score >= 0.5:
+            return synonyms[max_score_index]
+        return None
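And a short usage sketch of the rewritten module itself, assuming en_core_web_sm and the NLTK data are already installed (the Dockerfile above handles both at build time); the sample sentence is illustrative only.

from text_humanizer import TextHumanizer, download_nltk_resources

download_nltk_resources()   # fetches punkt, wordnet, and the taggers; skipped when already present
h = TextHumanizer(seed=42)  # fixing the seed makes the random transformations repeatable
print(h.humanize_text(
    "We can't prove it, but the model's output doesn't look right.",
    use_passive=False,
    use_synonyms=True,
))

Contractions are always expanded; synonym replacement fires per sentence with probability p_synonym_replacement, then per eligible word with probability 0.5, keeping only WordNet candidates whose embedding cosine similarity to the original word is at least 0.5.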