Jay-Rajput committed
Commit ea2133c · 1 Parent(s): 9e7dc23
Files changed (4)
  1. Dockerfile +22 -0
  2. app.py +38 -202
  3. requirements.txt +5 -15
  4. text_humanizer.py +188 -378
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ FROM python:3.10
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+ COPY --chown=user . /app
+
+ # download spacy model and nltk resources at build time (source copied above, since the heredoc imports text_humanizer)
+ RUN python -m spacy download en_core_web_sm || true
+ RUN python - <<'PY'
+ from text_humanizer import download_nltk_resources
+ download_nltk_resources()
+ PY
+
+ EXPOSE 7860
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
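A quick smoke test for the built image (a hypothetical sketch, not part of this commit; it assumes the container was started locally with docker run -p 7860:7860 <image>), using only the standard library:

    # smoke_test.py - hypothetical; expects the container to be running locally
    import json
    import urllib.request

    with urllib.request.urlopen("http://localhost:7860/") as resp:
        print(json.load(resp))  # the root route in app.py returns {"Hello": "World!"}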
app.py CHANGED
@@ -1,209 +1,45 @@
- # For Hugging Face Spaces - this is the main app file
- import gradio as gr
- import time
  import os
-
- # Import our humanizer
- from text_humanizer import AITextHumanizer
-
- # Initialize the humanizer
- print("🚀 Loading AI Text Humanizer for Hugging Face Spaces...")
- try:
-     humanizer = AITextHumanizer()
-     print("✅ Humanizer loaded successfully!")
- except Exception as e:
-     print(f"❌ Error loading humanizer: {e}")
-     humanizer = None
-
- def humanize_text_hf(text, style, intensity):
-     """
-     Hugging Face Spaces interface function for text humanization
-     """
-     if not text.strip():
-         return "⚠️ Please enter some text to humanize.", "", 0.0, "No changes made", 0.0
-
-     if humanizer is None:
-         return "❌ Error: Humanizer not loaded properly.", "", 0.0, "System error", 0.0
-
      try:
-         start_time = time.time()
-
-         # Humanize the text
-         result = humanizer.humanize_text(
-             text=text,
-             style=style.lower(),
-             intensity=intensity
-         )
-
-         processing_time = (time.time() - start_time) * 1000
-
-         changes_text = ", ".join(result["changes_made"]) if result["changes_made"] else "No significant changes made"
-
-         return (
-             result["humanized_text"],
-             f"**📊 Processing Results:**\n- **Similarity Score:** {result['similarity_score']:.3f}\n- **Processing Time:** {processing_time:.1f}ms\n- **Style:** {result['style'].title()}\n- **Intensity:** {result['intensity']}\n\n**🔄 Changes Made:** {changes_text}",
-             result["similarity_score"],
-             changes_text,
-             processing_time
-         )
-
-     except Exception as e:
-         return f"❌ Error processing text: {str(e)}", "", 0.0, "Processing error", 0.0

- # Create the Hugging Face Spaces interface
- with gr.Blocks(
-     title="🤖➡️👤 AI Text Humanizer",
-     theme=gr.themes.Soft(),
-     css="""
-     .main-header {
-         text-align: center;
-         background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-         color: white;
-         padding: 20px;
-         border-radius: 10px;
-         margin-bottom: 20px;
-     }
-     .stats-box {
-         background: #f8f9fa;
-         padding: 15px;
-         border-radius: 8px;
-         border-left: 4px solid #667eea;
-     }
-     """
- ) as iface:
-
-     gr.HTML("""
-     <div class="main-header">
-         <h1>🤖➡️👤 AI Text Humanizer</h1>
-         <p>Transform AI-generated text to sound more natural and human-like</p>
-         <p><em>Powered by advanced NLP techniques and transformers</em></p>
-     </div>
-     """)
-
-     with gr.Tab("🎯 Humanize Text"):
-         with gr.Row():
-             with gr.Column(scale=1):
-                 gr.HTML("<h3>📝 Input</h3>")
-
-                 input_text = gr.Textbox(
-                     label="Text to Humanize",
-                     placeholder="Paste your AI-generated text here...\n\nExample: Furthermore, it is important to note that artificial intelligence systems demonstrate significant capabilities...",
-                     lines=10,
-                     max_lines=20
-                 )
-
-                 with gr.Row():
-                     style_dropdown = gr.Dropdown(
-                         choices=["Natural", "Casual", "Conversational"],
-                         value="Natural",
-                         label="🎨 Humanization Style"
-                     )
-
-                     intensity_slider = gr.Slider(
-                         minimum=0.1,
-                         maximum=1.0,
-                         value=0.7,
-                         step=0.1,
-                         label="⚡ Intensity Level"
-                     )
-
-                 humanize_btn = gr.Button(
-                     "🚀 Humanize Text",
-                     variant="primary",
-                     size="lg"
-                 )
-
-             with gr.Column(scale=1):
-                 gr.HTML("<h3>✨ Output</h3>")
-
-                 output_text = gr.Textbox(
-                     label="Humanized Text",
-                     lines=10,
-                     max_lines=20,
-                     show_copy_button=True
-                 )
-
-                 stats_output = gr.Markdown(
-                     label="📊 Processing Statistics",
-                     value="Results will appear here after processing..."
-                 )
-
-     with gr.Tab("📊 Examples & Guide"):
-         gr.HTML("<h3>💡 Try These Examples</h3>")
-
-         # Examples
-         examples = gr.Examples(
-             examples=[
-                 [
-                     "Furthermore, it is important to note that artificial intelligence systems demonstrate significant capabilities in natural language processing tasks. Subsequently, these systems can analyze and generate text with remarkable accuracy. Nevertheless, it is crucial to understand that human oversight remains essential for optimal performance.",
-                     "Conversational",
-                     0.8
-                 ],
-                 [
-                     "The implementation of this comprehensive solution will facilitate the optimization of business processes and operational workflows. Moreover, it will demonstrate substantial improvements in efficiency metrics while maintaining quality standards.",
-                     "Natural",
-                     0.6
-                 ],
-                 [
-                     "In conclusion, the systematic analysis reveals that the proposed methodology demonstrates significant potential for enhancing performance indicators. Additionally, the structured approach ensures optimal resource utilization.",
-                     "Casual",
-                     0.7
-                 ]
-             ],
-             inputs=[input_text, style_dropdown, intensity_slider],
-             outputs=[output_text, stats_output],
-             fn=humanize_text_hf,
-             cache_examples=False
-         )
-
-         gr.HTML("""
-         <div style="margin-top: 30px;">
-             <h3>🎯 How It Works</h3>
-             <div class="stats-box">
-                 <h4>🔧 Transformation Techniques:</h4>
-                 <ul>
-                     <li><strong>Smart Word Replacement:</strong> formal words → casual alternatives</li>
-                     <li><strong>Contraction Addition:</strong> "do not" → "don't", "it is" → "it's"</li>
-                     <li><strong>AI Transition Removal:</strong> removes robotic transition phrases</li>
-                     <li><strong>Sentence Restructuring:</strong> varies length and structure</li>
-                     <li><strong>Natural Imperfections:</strong> adds human-like variations</li>
-                     <li><strong>Context-Aware Paraphrasing:</strong> maintains meaning while improving flow</li>
-                 </ul>
-             </div>
-
-             <div class="stats-box" style="margin-top: 15px;">
-                 <h4>🎨 Style Guide:</h4>
-                 <ul>
-                     <li><strong>Natural (0.5-0.7):</strong> Professional content with human touch</li>
-                     <li><strong>Casual (0.6-0.8):</strong> Blog posts, articles, informal content</li>
-                     <li><strong>Conversational (0.7-1.0):</strong> Social media, very informal text</li>
-                 </ul>
-             </div>
-
-             <div class="stats-box" style="margin-top: 15px;">
-                 <h4>⚡ Performance:</h4>
-                 <ul>
-                     <li><strong>Similarity Preservation:</strong> Maintains 85-95% semantic similarity</li>
-                     <li><strong>Processing Speed:</strong> ~500ms for typical paragraphs</li>
-                     <li><strong>Quality:</strong> Advanced NLP models ensure high-quality output</li>
-                 </ul>
-             </div>
-         </div>
-         """)
-
-     # Event handlers
-     humanize_btn.click(
-         fn=humanize_text_hf,
-         inputs=[input_text, style_dropdown, intensity_slider],
-         outputs=[output_text, stats_output]
-     )

- # Launch for Hugging Face Spaces
- if __name__ == "__main__":
-     print("🌐 Launching AI Text Humanizer on Hugging Face Spaces...")
-     iface.launch(
-         share=False, # HF Spaces handles sharing
-         server_name="0.0.0.0",
-         server_port=7860,
-         show_error=True
-     )
  import os
+ from fastapi import FastAPI, Header, HTTPException, Depends
+ from pydantic import BaseModel
+ from text_humanizer import TextHumanizer, download_nltk_resources
+ import spacy

+ API_KEY = os.environ.get("API_KEY", "dev-key")
+ PORT = int(os.environ.get("PORT", 7860))

+ app = FastAPI()
+ humanizer = None

+ class HumanizeReq(BaseModel):
+     text: str
+     use_passive: bool = False
+     use_synonyms: bool = False
+
+ def verify_key(x_api_key: str = Header(None)):
+     if x_api_key != API_KEY:
+         raise HTTPException(status_code=403, detail="Forbidden")
+     return True
+
+ @app.get("/")
+ def greet_json():
+     return {"Hello": "World!"}
+
+ @app.on_event("startup")
+ def startup():
+     # ensure NLTK resources and spacy model are available at runtime
+     download_nltk_resources()
      try:
+         spacy.load("en_core_web_sm")
+     except OSError:
+         import spacy.cli
+         spacy.cli.download("en_core_web_sm")
+     global humanizer
+     humanizer = TextHumanizer()

+ @app.post("/humanize")
+ def humanize(req: HumanizeReq, _=Depends(verify_key)):
+     return {"humanized": humanizer.humanize_text(req.text, req.use_passive, req.use_synonyms)}

+ # if __name__ == "__main__":
+ #     import uvicorn
+ #     uvicorn.run(app, host="0.0.0.0", port=PORT)
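A minimal client sketch for the new /humanize endpoint (hypothetical, not part of this commit; assumes the `requests` package is available and a server running locally with the default API_KEY of "dev-key"):

    # client_example.py - hypothetical sketch
    import requests

    resp = requests.post(
        "http://localhost:7860/humanize",
        headers={"x-api-key": "dev-key"},  # FastAPI maps the x_api_key Header to "x-api-key"
        json={"text": "We can't ignore this, it's important.", "use_synonyms": True},
    )
    resp.raise_for_status()
    print(resp.json()["humanized"])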
requirements.txt CHANGED
@@ -1,15 +1,5 @@
- fastapi==0.104.1
- uvicorn[standard]==0.24.0
- gradio==4.7.1
- transformers==4.35.0
- torch==2.1.0
- sentence-transformers==2.2.2
- nltk==3.8.1
- spacy>=3.7.0
- pydantic==2.5.0
- numpy==1.25.2
- pandas==2.1.3
- redis==5.0.1
- python-multipart==0.0.6
- aiofiles==23.2.1
- requests==2.31.0
+ fastapi
+ uvicorn[standard]
+ spacy
+ nltk
+ sentence-transformers
text_humanizer.py CHANGED
@@ -1,390 +1,200 @@
- import re
  import random
- import nltk
- from typing import List, Dict, Optional
- from sentence_transformers import SentenceTransformer
- import numpy as np
- from transformers import pipeline
-
- # Download required NLTK data
- try:
-     nltk.data.find('tokenizers/punkt')
- except LookupError:
-     nltk.download('punkt')
-
- try:
-     nltk.data.find('corpora/wordnet')
- except LookupError:
-     nltk.download('wordnet')
-
- try:
-     nltk.data.find('corpora/omw-1.4')
- except LookupError:
-     nltk.download('omw-1.4')
-
- from nltk.tokenize import sent_tokenize, word_tokenize
- from nltk.corpus import wordnet
-
- class AITextHumanizer:
-     def __init__(self):
-         """Initialize the text humanizer with necessary models and data"""
-         print("Loading models...")
-
-         # Load sentence transformer for semantic similarity
          try:
-             self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
          except Exception as e:
-             print(f"Warning: Could not load similarity model: {e}")
-             self.similarity_model = None
-
-         # Initialize paraphrasing pipeline
-         try:
-             self.paraphraser = pipeline("text2text-generation",
-                                         model="google/flan-t5-small",
-                                         max_length=512)
-         except Exception as e:
-             print(f"Warning: Could not load paraphrasing model: {e}")
-             self.paraphraser = None
-
-         # Formal to casual word mappings
-         self.formal_to_casual = {
-             "utilize": "use",
-             "demonstrate": "show",
-             "facilitate": "help",
-             "implement": "do",
-             "consequently": "so",
-             "therefore": "so",
-             "nevertheless": "but",
-             "furthermore": "also",
-             "moreover": "also",
-             "subsequently": "then",
-             "accordingly": "so",
-             "regarding": "about",
-             "concerning": "about",
-             "pertaining": "about",
-             "approximately": "about",
-             "endeavor": "try",
-             "commence": "start",
-             "terminate": "end",
-             "obtain": "get",
-             "purchase": "buy",
-             "examine": "look at",
-             "analyze": "study",
-             "construct": "build",
-             "establish": "set up",
-             "magnitude": "size",
-             "comprehensive": "complete",
-             "significant": "big",
-             "substantial": "large",
-             "optimal": "best",
-             "sufficient": "enough",
-             "prior to": "before",
-             "in order to": "to",
-             "due to the fact that": "because",
-             "at this point in time": "now",
-             "in the event that": "if",
-         }
-
-         # Contractions mapping
-         self.contractions = {
-             "do not": "don't",
-             "does not": "doesn't",
-             "did not": "didn't",
-             "will not": "won't",
-             "would not": "wouldn't",
-             "should not": "shouldn't",
-             "could not": "couldn't",
-             "cannot": "can't",
-             "is not": "isn't",
-             "are not": "aren't",
-             "was not": "wasn't",
-             "were not": "weren't",
-             "have not": "haven't",
-             "has not": "hasn't",
-             "had not": "hadn't",
-             "I am": "I'm",
-             "you are": "you're",
-             "he is": "he's",
-             "she is": "she's",
-             "it is": "it's",
-             "we are": "we're",
-             "they are": "they're",
-             "I have": "I've",
-             "you have": "you've",
-             "we have": "we've",
-             "they have": "they've",
-             "I will": "I'll",
-             "you will": "you'll",
-             "he will": "he'll",
-             "she will": "she'll",
-             "it will": "it'll",
-             "we will": "we'll",
-             "they will": "they'll",
-         }
-
-         # Transition words that make text sound more AI-like
-         self.ai_transition_words = [
-             "Furthermore,", "Moreover,", "Additionally,", "Subsequently,",
-             "Consequently,", "Therefore,", "Nevertheless,", "However,",
-             "In conclusion,", "To summarize,", "In summary,", "Overall,",
-             "It is important to note that", "It should be emphasized that",
-             "It is worth mentioning that", "It is crucial to understand that"
-         ]
-
-         # Natural alternatives
-         self.natural_transitions = [
-             "Also,", "Plus,", "And,", "Then,", "So,", "But,", "Still,",
-             "Anyway,", "By the way,", "Actually,", "Basically,",
-             "Look,", "Listen,", "Here's the thing:", "The point is,",
-             "What's more,", "On top of that,", "Another thing,",
          ]
-
-         print("Humanizer initialized successfully!")
-
-     def add_contractions(self, text: str) -> str:
-         """Add contractions to make text sound more natural"""
-         for formal, casual in self.contractions.items():
-             # Case insensitive replacement but preserve original case
-             pattern = re.compile(re.escape(formal), re.IGNORECASE)
-             text = pattern.sub(casual, text)
-         return text
-
-     def replace_formal_words(self, text: str, replacement_rate: float = 0.7) -> str:
-         """Replace formal words with casual alternatives"""
-         words = word_tokenize(text)
-
-         for i, word in enumerate(words):
-             word_lower = word.lower()
-             if word_lower in self.formal_to_casual and random.random() < replacement_rate:
-                 # Preserve original case
-                 if word.isupper():
-                     words[i] = self.formal_to_casual[word_lower].upper()
-                 elif word.istitle():
-                     words[i] = self.formal_to_casual[word_lower].title()
-                 else:
-                     words[i] = self.formal_to_casual[word_lower]
-
-         # Reconstruct text with proper spacing
-         result = ""
-         for i, word in enumerate(words):
-             if i > 0 and word not in ".,!?;:":
-                 result += " "
-             result += word
-
-         return result
-
-     def vary_sentence_structure(self, text: str) -> str:
-         """Vary sentence structure to sound more natural"""
-         sentences = sent_tokenize(text)
-         varied_sentences = []
-
-         for sentence in sentences:
-             # Sometimes start sentences with connecting words
-             if random.random() < 0.3:
-                 connectors = ["Well,", "So,", "Now,", "Look,", "Actually,", "Basically,"]
-                 if not any(sentence.startswith(word) for word in connectors):
-                     sentence = random.choice(connectors) + " " + sentence.lower()
-
-             # Occasionally break long sentences
-             if len(sentence.split()) > 20 and random.random() < 0.4:
-                 words = sentence.split()
-                 mid_point = len(words) // 2
-                 # Find a natural break point near the middle
-                 for i in range(mid_point - 3, min(mid_point + 3, len(words))):
-                     if words[i] in [',', 'and', 'but', 'or', 'so']:
-                         sentence1 = ' '.join(words[:i+1])
-                         sentence2 = ' '.join(words[i+1:])
-                         if sentence2:
-                             sentence2 = sentence2[0].upper() + sentence2[1:]
-                         varied_sentences.append(sentence1)
-                         sentence = sentence2
-                         break
-
-             varied_sentences.append(sentence)
-
-         return ' '.join(varied_sentences)
-
-     def replace_ai_transitions(self, text: str) -> str:
-         """Replace AI-like transition words with natural alternatives"""
-         for ai_word in self.ai_transition_words:
-             if ai_word in text:
-                 natural_replacement = random.choice(self.natural_transitions)
-                 text = text.replace(ai_word, natural_replacement)
-         return text
-
-     def add_natural_imperfections(self, text: str, imperfection_rate: float = 0.1) -> str:
-         """Add subtle imperfections to make text more human-like"""
-         sentences = sent_tokenize(text)
-         modified_sentences = []
-
-         for sentence in sentences:
-             # Occasionally start with lowercase after punctuation (casual style)
-             if random.random() < imperfection_rate:
-                 words = sentence.split()
-                 if len(words) > 1 and words[0].lower() in ['and', 'but', 'or', 'so']:
-                     sentence = words[0].lower() + ' ' + ' '.join(words[1:])
-
-             # Sometimes use informal punctuation
-             if random.random() < imperfection_rate:
-                 if sentence.endswith('.'):
-                     sentence = sentence[:-1]  # Remove period occasionally
-                 elif not sentence.endswith(('.', '!', '?')):
-                     if random.random() < 0.5:
-                         sentence += '.'
-
-             modified_sentences.append(sentence)
-
-         return ' '.join(modified_sentences)
-
-     def paraphrase_segments(self, text: str, paraphrase_rate: float = 0.3) -> str:
-         """Paraphrase some segments using the transformer model"""
-         if not self.paraphraser:
-             return text
-
-         sentences = sent_tokenize(text)
-         paraphrased_sentences = []
-
-         for sentence in sentences:
-             if random.random() < paraphrase_rate and len(sentence.split()) > 5:
-                 try:
-                     # Create paraphrase prompt
-                     prompt = f"Rewrite this sentence in a more natural, conversational way: {sentence}"
-
-                     result = self.paraphraser(prompt, max_length=100, num_return_sequences=1)
-                     paraphrased = result[0]['generated_text']
-
-                     # Clean up the result
-                     paraphrased = paraphrased.replace(prompt, '').strip()
-                     if paraphrased and len(paraphrased) > 10:
-                         paraphrased_sentences.append(paraphrased)
                      else:
-                         paraphrased_sentences.append(sentence)
-                 except Exception as e:
-                     print(f"Paraphrasing failed: {e}")
-                     paraphrased_sentences.append(sentence)
             else:
-                 paraphrased_sentences.append(sentence)
-
-         return ' '.join(paraphrased_sentences)
-
-     def calculate_similarity(self, text1: str, text2: str) -> float:
-         """Calculate semantic similarity between original and humanized text"""
-         if not self.similarity_model:
-             return 0.85  # Return reasonable default if model not available
-
-         try:
-             embeddings1 = self.similarity_model.encode([text1])
-             embeddings2 = self.similarity_model.encode([text2])
-             similarity = np.dot(embeddings1[0], embeddings2[0]) / (
-                 np.linalg.norm(embeddings1[0]) * np.linalg.norm(embeddings2[0])
-             )
-             return float(similarity)
-         except Exception as e:
-             print(f"Similarity calculation failed: {e}")
-             return 0.85
-
-     def humanize_text(self,
-                       text: str,
-                       style: str = "natural",
-                       intensity: float = 0.7) -> Dict:
-         """
-         Main humanization function
-
-         Args:
-             text: Input text to humanize
-             style: Style of humanization ('natural', 'casual', 'conversational')
-             intensity: Intensity of humanization (0.0 to 1.0)
-
-         Returns:
-             Dictionary with humanized text and metadata
-         """
-         if not text.strip():
-             return {
-                 "original_text": text,
-                 "humanized_text": text,
-                 "similarity_score": 1.0,
-                 "changes_made": []
-             }
-
-         changes_made = []
-         humanized_text = text
-
-         # Apply transformations based on intensity
-         if intensity > 0.2:
-             # Replace formal words
-             before_formal = humanized_text
-             humanized_text = self.replace_formal_words(humanized_text, intensity * 0.7)
-             if humanized_text != before_formal:
-                 changes_made.append("Replaced formal words with casual alternatives")
-
-         if intensity > 0.3:
-             # Add contractions
-             before_contractions = humanized_text
-             humanized_text = self.add_contractions(humanized_text)
-             if humanized_text != before_contractions:
-                 changes_made.append("Added contractions")
-
-         if intensity > 0.4:
-             # Replace AI-like transitions
-             before_transitions = humanized_text
-             humanized_text = self.replace_ai_transitions(humanized_text)
-             if humanized_text != before_transitions:
-                 changes_made.append("Replaced AI-like transition words")
-
-         if intensity > 0.5:
-             # Vary sentence structure
-             before_structure = humanized_text
-             humanized_text = self.vary_sentence_structure(humanized_text)
-             if humanized_text != before_structure:
-                 changes_made.append("Varied sentence structure")
-
-         if intensity > 0.6 and style in ["casual", "conversational"]:
-             # Add natural imperfections
-             before_imperfections = humanized_text
-             humanized_text = self.add_natural_imperfections(humanized_text, intensity * 0.2)
-             if humanized_text != before_imperfections:
-                 changes_made.append("Added natural imperfections")
-
-         if intensity > 0.7:
-             # Paraphrase some segments
-             before_paraphrase = humanized_text
-             humanized_text = self.paraphrase_segments(humanized_text, intensity * 0.4)
-             if humanized_text != before_paraphrase:
-                 changes_made.append("Paraphrased some segments")
-
-         # Calculate similarity
-         similarity_score = self.calculate_similarity(text, humanized_text)
-
-         return {
-             "original_text": text,
-             "humanized_text": humanized_text,
-             "similarity_score": similarity_score,
-             "changes_made": changes_made,
-             "style": style,
-             "intensity": intensity
-         }

- # Test the humanizer
- if __name__ == "__main__":
-     humanizer = AITextHumanizer()
-
-     # Test text
-     test_text = """
-     Furthermore, it is important to note that artificial intelligence systems demonstrate
-     significant capabilities in natural language processing tasks. Subsequently, these
-     systems can analyze and generate text with remarkable accuracy. Nevertheless, it is
-     crucial to understand that human oversight remains essential for optimal performance.
-     Therefore, organizations should implement comprehensive strategies to utilize these
-     technologies effectively while maintaining quality standards.
-     """
-
-     print("Original Text:")
-     print(test_text)
-     print("\n" + "="*50 + "\n")
-
-     result = humanizer.humanize_text(test_text, style="conversational", intensity=0.8)
-
-     print("Humanized Text:")
-     print(result["humanized_text"])
-     print(f"\nSimilarity Score: {result['similarity_score']:.3f}")
-     print(f"Changes Made: {', '.join(result['changes_made'])}")
+ import ssl
  import random
+ import warnings

+ import nltk
+ import spacy
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import wordnet
+ from sentence_transformers import SentenceTransformer, util

+ warnings.filterwarnings("ignore", category=FutureWarning)

+ # NOTE: no module-level spacy.load() here; loading the model lazily in
+ # TextHumanizer keeps `import text_humanizer` (and with it the FastAPI app)
+ # from crashing when en_core_web_sm has not been downloaded yet.

+ def download_nltk_resources():
+     """
+     Download required NLTK resources if not already installed.
+     """
+     try:
+         _create_unverified_https_context = ssl._create_unverified_context
+     except AttributeError:
+         pass
+     else:
+         ssl._create_default_https_context = _create_unverified_https_context

+     resources = ['punkt', 'averaged_perceptron_tagger', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger_eng']
+     for resource in resources:
          try:
+             nltk.download(resource, quiet=True)
          except Exception as e:
+             print(f"Error downloading {resource}: {str(e)}")
+
+
+ class TextHumanizer:
+     """
+     Transforms text into a more formal (academic) style:
+     - Expands contractions
+     - Adds academic transitions
+     - Optionally converts some sentences to passive voice
+     - Optionally replaces words with synonyms for more formality
+     """
+
+     def __init__(
+         self,
+         model_name='paraphrase-MiniLM-L6-v2',
+         p_passive=0.2,
+         p_synonym_replacement=0.3,
+         p_academic_transition=0.3,
+         seed=None
+     ):
+         if seed is not None:
+             random.seed(seed)
+
+         self.nlp = spacy.load("en_core_web_sm")
+         self.model = SentenceTransformer(model_name)
+
+         # Transformation probabilities
+         self.p_passive = p_passive
+         self.p_synonym_replacement = p_synonym_replacement
+         self.p_academic_transition = p_academic_transition
+
+         # Common academic transitions
+         self.academic_transitions = [
+             "Moreover,", "Additionally,", "Furthermore,", "Hence,",
+             "Therefore,", "Consequently,", "Nonetheless,", "Nevertheless,"
          ]
+
+     def humanize_text(self, text, use_passive=False, use_synonyms=False):
+         doc = self.nlp(text)
+         transformed_sentences = []
+
+         for sent in doc.sents:
+             sentence_str = sent.text.strip()
+
+             # 1. Expand contractions
+             sentence_str = self.expand_contractions(sentence_str)
+
+             # 2. Possibly add academic transitions (currently disabled)
+             # if random.random() < self.p_academic_transition:
+             #     sentence_str = self.add_academic_transitions(sentence_str)
+
+             # 3. Optionally convert to passive
+             if use_passive and random.random() < self.p_passive:
+                 sentence_str = self.convert_to_passive(sentence_str)
+
+             # 4. Optionally replace words with synonyms
+             if use_synonyms and random.random() < self.p_synonym_replacement:
+                 sentence_str = self.replace_with_synonyms(sentence_str)
+
+             transformed_sentences.append(sentence_str)
+
+         return ' '.join(transformed_sentences)
+
+     def expand_contractions(self, sentence):
+         contraction_map = {
+             "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
+             "'ve": " have", "'d": " would", "'m": " am"
+         }
+         tokens = word_tokenize(sentence)
+         expanded_tokens = []
+         for token in tokens:
+             lower_token = token.lower()
+             replaced = False
+             for contraction, expansion in contraction_map.items():
+                 if contraction in lower_token and lower_token.endswith(contraction):
+                     # strip() avoids a stray leading space when the token is the
+                     # contraction suffix itself (word_tokenize yields "n't", "'s", ...)
+                     new_token = lower_token.replace(contraction, expansion).strip()
+                     if token[0].isupper():
+                         new_token = new_token.capitalize()
+                     expanded_tokens.append(new_token)
+                     replaced = True
+                     break
+             if not replaced:
+                 expanded_tokens.append(token)
+
+         return ' '.join(expanded_tokens)
+
+     def add_academic_transitions(self, sentence):
+         transition = random.choice(self.academic_transitions)
+         return f"{transition} {sentence}"
+
+     def convert_to_passive(self, sentence):
+         # Heuristic: only rewrites a simple "subject verb object" span; the
+         # sentence is returned unchanged when no such pattern is found.
+         doc = self.nlp(sentence)
+         subj_tokens = [t for t in doc if t.dep_ == 'nsubj' and t.head.dep_ == 'ROOT']
+         dobj_tokens = [t for t in doc if t.dep_ == 'dobj']
+
+         if subj_tokens and dobj_tokens:
+             subject = subj_tokens[0]
+             dobj = dobj_tokens[0]
+             verb = subject.head
+             if subject.i < verb.i < dobj.i:
+                 passive_str = f"{dobj.text} {verb.lemma_} by {subject.text}"
+                 original_str = ' '.join(token.text for token in doc)
+                 chunk = f"{subject.text} {verb.text} {dobj.text}"
+                 if chunk in original_str:
+                     sentence = original_str.replace(chunk, passive_str)
+         return sentence
+
+     def replace_with_synonyms(self, sentence):
+         tokens = word_tokenize(sentence)
+         pos_tags = nltk.pos_tag(tokens)
+
+         new_tokens = []
+         for (word, pos) in pos_tags:
+             # Only adjectives, nouns, verbs and adverbs with WordNet entries are candidates
+             if pos.startswith(('J', 'N', 'V', 'R')) and wordnet.synsets(word):
+                 if random.random() < 0.5:
+                     synonyms = self._get_synonyms(word, pos)
+                     if synonyms:
+                         best_synonym = self._select_closest_synonym(word, synonyms)
+                         new_tokens.append(best_synonym if best_synonym else word)
+                     else:
+                         new_tokens.append(word)
+                 else:
+                     new_tokens.append(word)
+             else:
+                 new_tokens.append(word)
+
+         # Join cleanly with punctuation fix
+         sentence = " ".join(new_tokens)
+         sentence = (
+             sentence.replace(" ,", ",")
+             .replace(" .", ".")
+             .replace(" !", "!")
+             .replace(" ?", "?")
+             .replace(" :", ":")
+             .replace(" '", "'")
+         )
+         return sentence
+
+     def _get_synonyms(self, word, pos):
+         wn_pos = None
+         if pos.startswith('J'):
+             wn_pos = wordnet.ADJ
+         elif pos.startswith('N'):
+             wn_pos = wordnet.NOUN
+         elif pos.startswith('R'):
+             wn_pos = wordnet.ADV
+         elif pos.startswith('V'):
+             wn_pos = wordnet.VERB
+
+         synonyms = set()
+         for syn in wordnet.synsets(word, pos=wn_pos):
+             for lemma in syn.lemmas():
+                 lemma_name = lemma.name().replace('_', ' ')
+                 if lemma_name.lower() != word.lower():
+                     synonyms.add(lemma_name)
+         return list(synonyms)
+
+     def _select_closest_synonym(self, original_word, synonyms):
+         if not synonyms:
+             return None
+         original_emb = self.model.encode(original_word, convert_to_tensor=True)
+         synonym_embs = self.model.encode(synonyms, convert_to_tensor=True)
+         cos_scores = util.cos_sim(original_emb, synonym_embs)[0]
+         max_score_index = cos_scores.argmax().item()
+         max_score = cos_scores[max_score_index].item()
+         # Accept a synonym only when it is reasonably close to the original word
+         if max_score >= 0.5:
+             return synonyms[max_score_index]
+         return None
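A minimal sketch of using the class directly (hypothetical; assumes en_core_web_sm and the NLTK data are already installed, e.g. via the build steps in the Dockerfile):

    # usage sketch, not part of this commit
    from text_humanizer import TextHumanizer, download_nltk_resources

    download_nltk_resources()
    h = TextHumanizer(seed=42)  # a fixed seed makes the random choices reproducible
    print(h.humanize_text("We can't be sure the results'll hold.",
                          use_passive=False, use_synonyms=True))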