aboltachka committed on
Commit
d7f3d30
1 Parent(s): 21c30ae

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -67
app.py CHANGED
@@ -219,8 +219,7 @@ group_ethnicity = [('ethnicity',0,re.compile(r'\brac[a-zA-Z]{0,3}\b')),
219
  ('ethnicity',75,re.compile(r'\bsioux\b')),
220
  ('ethnicity',76,re.compile(r'\bsiouan\b')),
221
  ('ethnicity',77,re.compile(r'\bchippewa[a-zA-Z]{0,3}\b')),
222
- ('ethnicity',78,re.compile(r'\bchoctaw[a-zA-Z]{0,3}\b')),
223
- ('ethnicity',79,re.compile(r'\brace-related\b'))] #Added by Anton
224
 
225
  group_blackball = [('blackball',0, re.compile(r'\bblack.{0,3}market[a-zA-Z- ]{0,3}\b')),
226
  ('blackball',1, re.compile(r'\bblack.{0,3}economy\b')),
@@ -385,18 +384,25 @@ def rr_detector(title_raw, abstract_raw):
385
 
386
  #Append dictionaries into a data frame for Detailed statistics
387
  df_issue = pd.DataFrame(list(issue_count.items()), columns=['term', 'freq'])
388
- df_issue['type'] = 'issue'
389
  df_issue = df_issue[['type', 'term', 'freq']]
390
 
391
  df_group = pd.DataFrame(list(group_count.items()), columns=['term', 'freq'])
392
- df_group['type'] = 'group'
393
  df_group = df_group[['type', 'term', 'freq']]
394
 
395
  df_blackball = pd.DataFrame(list(blackball_count.items()), columns=['term', 'freq'])
396
- df_blackball['type'] = 'whitelist'
397
  df_blackball = df_blackball[['type', 'term', 'freq']]
398
 
399
  df_details = pd.concat([df_group, df_issue, df_blackball], ignore_index=True)
 
 
 
 
 
 
 
400
 
401
  #TEXT ANALYSIS
402
  #Dictionary with issue, topic, and blackball keywords
@@ -405,19 +411,33 @@ def rr_detector(title_raw, abstract_raw):
405
  keywords_dict["group"].extend(group_count.keys())
406
  keywords_dict["whitelist"].extend(blackball_count.keys())
407
 
408
- combined_text = f"TITLE: {title_raw}. ABSTRACT: {abstract_raw}"
409
-
410
- text_analysis = []
411
- for word in combined_text.split():
412
- print(word)
413
- if word.lower() in [item.lower() for sublist in keywords_dict.values() for item in sublist]:
414
- for key, words in keywords_dict.items():
415
- if word.lower() in [item.lower() for item in words]:
416
- text_analysis.append((word, key))
 
 
 
 
 
 
 
 
 
417
  break
418
- else:
419
- text_analysis.append((word, None))
420
-
 
 
 
 
 
421
  #FORM THE MAIN OUTPUT
422
  #Output
423
  if result_dict['match_1'] == 1:
@@ -435,7 +455,7 @@ def rr_detector(title_raw, abstract_raw):
435
  output_image = os.path.join(dirname, 'images/no.png')
436
  #Explanation
437
  unique_blackball_str = ', '.join(blackball_count)
438
- answer = "This paper cannot be considered race-related, as it includes the blackball phrase(s), such as: " + unique_blackball_str + "."
439
  else:
440
  #Result
441
  output_image = os.path.join(dirname, 'images/no.png')
@@ -444,7 +464,7 @@ def rr_detector(title_raw, abstract_raw):
444
  #Details
445
  if len(issue_count.keys()) == 0 and len(group_count.keys()) == 0 and len(blackball_count.keys()) == 0 :
446
  data = {
447
- "type": ["whitelist", "issue", "group"],
448
  "term": ["term1", "term2", "term3"],
449
  "freq": [0, 0, 0]
450
  }
@@ -517,7 +537,7 @@ abstract_smpl = "Issues of racial justice and persistent economic inequalities a
517
 
518
 
519
  demo = gr.Interface(fn=rr_detector, inputs=[
520
- gr.Textbox(label="Title", value=title_smpl, lines=2),
521
  gr.Textbox(label="Abstract", value=abstract_smpl, lines=18)],
522
  outputs=[
523
  gr.Image(label = 'Result', value=def_image),
@@ -535,12 +555,15 @@ demo = gr.Interface(fn=rr_detector, inputs=[
535
  width = 300,
536
  color_legend_title = 'Type of Keywords',
537
  x_title = "Keywords",
538
- y_title = "Frequency"
 
 
 
539
  ),
540
  gr.HighlightedText(
541
  label="Text Analysis",
542
- show_legend=True,
543
- color_map={"group": "yellow", "issue": "blue", "whitelist": "grey"}),
544
  ], theme='Jameswiller/Globe', title = title_prompt, description = description_prompt, allow_flagging = 'auto')
545
 
546
  #theme='gradio/monochrome'
@@ -549,54 +572,18 @@ demo = gr.Interface(fn=rr_detector, inputs=[
549
  if __name__ == "__main__":
550
  demo.launch(share=True)
551
 
552
- '''
553
- # Add default picture for output
554
- # Output as graph of just text but with fancy representation -- use labels from theme
555
- # Generate picts for output with GenAi
556
-
557
- #RR
558
- title_raw = 'When expectations work race and socioeconomic differences in school performance'
559
- abstract_raw = 'Why race between are expectations for future performance realized more often by some people than by others and why are such differences in the efficacy of performance expectations socially patterned we hypothesize that differences in attentiveness to performance feedback may be relevant reasoning that follow-through behaviors will be less well conceived when expectations are formed without regard to evaluation of previous performance. using data from baltimore fourth-grade students and their parents we find that expectations anticipate marks more accurately when recall of prior marks is correct than when it is incorrect. because errors of recall mostly on the high side are more common among lower-ses and minority children and their parents their school performance is affected most strongly. research on school attainment process from a motivational perspective must give more attention to the additional resources that facilitate successful goal attainment given high expectations. our perspective focuses on resources internal to the individual but external constraints also are important. the discussion stresses the need for further work in both areas.'
560
-
561
- title_raw = "Race-related Research in Economics disadvantaged minor race disparity"
562
- abstract_raw = "Issues of race disparity "
563
-
564
-
565
- #Default
566
- title_raw = "Race-related Research in Economics"
567
- abstract_raw = "Issues of racial justice and persistent economic inequalities across racial and ethnic groups have risen to the top of public debate. The ability of academic economists to contribute to these debates in part depends on the production of race-related research in the profession. We study the issue combining information on a corpus of 250,000 publications in economics from 1960 to 2020 on which we use an algorithmic approach to classify race-related publications, constructing paths to publication for 22,000 NBER working papers between 1974 and 2015, and constructing the career prole of publications of 2800 economics faculty in US economics departments active in 2020/1. We present four new stylized facts on race-related research in economics."
568
-
569
-
570
-
571
- #non-RR
572
- title_raw = 'Hurting stalemate or mediation the conflict over nagorno-karabakh 1990-95'
573
- abstract_raw = 'The impacts of six attempts to mediate the conflict over the political status of nagorno-karabakh in the caucasus region of the former soviet union were compared. each mediation was intended to get the direct parties armenia azerbaijan and nagorno-karabakh to the negotiating table. nearly 4000 events were recorded for a six-year period from 1990 through 1995. each event was coded in terms of a six-step scale ranging from a significant action toward peace 3 to substantial violence directed at an adversary -3. time-series analyses of changes in the extent of violence showed no change from before to after any of the mediations. a significant change did occur however between the months preceding and following the period of intensive combat between april 1993 and february 1994. these results support the hypothesis that a mutually hurting stalemate is a condition for negotiating a ceasefire and reduced violence between warring parties. a number of theoretical and practical implications of the findings are discussed.'
574
-
575
- title_raw = ""
576
- abstract_raw = ""
577
-
578
- rr_detector(title_raw, abstract_raw)
579
 
580
  '''
581
 
 
582
 
 
 
583
 
 
584
 
 
 
 
585
 
586
- #TEXT ANALYSIS -- IMPROVE
587
-
588
- # Graph: looks like when it is two words, it double count it: (this paper is about racial inequality, this paper is about racial inequality)
589
- #PROBLEM OF DOUBLE COUNT: GROUP (disadvantaged minor[a-zA-Z]{0,5}) and ISSUE (disadvantage)
590
-
591
-
592
-
593
- def highlight_words(sentence, words):
594
- for i in range(len(sentence)):
595
- for j in range(len(words)):
596
- if sentence.lower().startswith(words[j].lower(), i):
597
- sentence = sentence[:i] + sentence[i:i+len(words[j])].upper() + sentence[i+len(words[j]):]
598
- return sentence
599
-
600
- print(highlight_words("Have a nIcE day, you Nice person!!", ["nice"]))
601
- print(highlight_words("Shhh, don't be so loud!", ["loud", "Be"]))
602
- print(highlight_words("Automating with Python is fun", ["fun", "auTomaTiNG"]))
 
219
  ('ethnicity',75,re.compile(r'\bsioux\b')),
220
  ('ethnicity',76,re.compile(r'\bsiouan\b')),
221
  ('ethnicity',77,re.compile(r'\bchippewa[a-zA-Z]{0,3}\b')),
222
+ ('ethnicity',78,re.compile(r'\bchoctaw[a-zA-Z]{0,3}\b'))]
 
223
 
224
  group_blackball = [('blackball',0, re.compile(r'\bblack.{0,3}market[a-zA-Z- ]{0,3}\b')),
225
  ('blackball',1, re.compile(r'\bblack.{0,3}economy\b')),
 
384
 
385
  #Append dictionaries into a data frame for Detailed statistics
386
  df_issue = pd.DataFrame(list(issue_count.items()), columns=['term', 'freq'])
387
+ df_issue['type'] = 'ISSUE'
388
  df_issue = df_issue[['type', 'term', 'freq']]
389
 
390
  df_group = pd.DataFrame(list(group_count.items()), columns=['term', 'freq'])
391
+ df_group['type'] = 'GROUP'
392
  df_group = df_group[['type', 'term', 'freq']]
393
 
394
  df_blackball = pd.DataFrame(list(blackball_count.items()), columns=['term', 'freq'])
395
+ df_blackball['type'] = 'WHITELIST'
396
  df_blackball = df_blackball[['type', 'term', 'freq']]
397
 
398
  df_details = pd.concat([df_group, df_issue, df_blackball], ignore_index=True)
399
+ issue_default = {'type': 'ISSUE', 'term': '', 'freq': ''}
400
+ group_default = {'type': 'GROUP', 'term': '', 'freq': ''}
401
+ blackball_default = {'type': 'WHITELIST', 'term': '', 'freq': ''}
402
+ df_details.loc[len(df_details)] = issue_default
403
+ df_details.loc[len(df_details)] = group_default
404
+ df_details.loc[len(df_details)] = blackball_default
405
+ df_details = df_details.sort_values(by='type', ascending=False)
406
 
407
  #TEXT ANALYSIS
408
  #Dictionary with issue, topic, and blackball keywords
 
411
  keywords_dict["group"].extend(group_count.keys())
412
  keywords_dict["whitelist"].extend(blackball_count.keys())
413
 
414
+ combined_text = f"TITLE:\n{title_raw} \n \nABSTRACT:\n{abstract_raw}"
415
+
416
+ keywords = [(word, key, len(word)) for key, words in keywords_dict.items() for word in words]
417
+ keywords = sorted(keywords, key=lambda x: -x[2])
418
+
419
+ if len(keywords) > 0:
420
+ pattern = re.compile("|".join(map(re.escape, [x[0] for x in keywords])), re.IGNORECASE)
421
+ matches = re.finditer(pattern, combined_text)
422
+ text_analysis = []
423
+ last_end = 0
424
+ for match in matches:
425
+ start = match.start()
426
+ end = match.end()
427
+ if start != last_end:
428
+ text_analysis.append((combined_text[last_end:start], None))
429
+ for keyword, key, length in keywords:
430
+ if re.match(re.escape(keyword), match.group(), re.IGNORECASE):
431
+ text_analysis.append((combined_text[start:end], key))
432
  break
433
+ last_end = end
434
+
435
+ if last_end != len(combined_text):
436
+ text_analysis.append((combined_text[last_end:], None))
437
+
438
+ else:
439
+ text_analysis = [(combined_text, None)]
440
+
441
  #FORM THE MAIN OUTPUT
442
  #Output
443
  if result_dict['match_1'] == 1:
 
455
  output_image = os.path.join(dirname, 'images/no.png')
456
  #Explanation
457
  unique_blackball_str = ', '.join(blackball_count)
458
+ answer = "This paper cannot be considered race-related, as it includes the whitelist phrase(s), such as: " + unique_blackball_str + "."
459
  else:
460
  #Result
461
  output_image = os.path.join(dirname, 'images/no.png')
 
464
  #Details
465
  if len(issue_count.keys()) == 0 and len(group_count.keys()) == 0 and len(blackball_count.keys()) == 0 :
466
  data = {
467
+ "type": ["WHITELIST", "ISSUE", "GROUP"],
468
  "term": ["term1", "term2", "term3"],
469
  "freq": [0, 0, 0]
470
  }
 
537
 
538
 
539
  demo = gr.Interface(fn=rr_detector, inputs=[
540
+ gr.Textbox(label="Title", value=title_smpl, lines=1),
541
  gr.Textbox(label="Abstract", value=abstract_smpl, lines=18)],
542
  outputs=[
543
  gr.Image(label = 'Result', value=def_image),
 
555
  width = 300,
556
  color_legend_title = 'Type of Keywords',
557
  x_title = "Keywords",
558
+ y_title = "Frequency",
559
+ show_label = True,
560
+ #sort = '-x',
561
+ color_legend_position = 'right',
562
  ),
563
  gr.HighlightedText(
564
  label="Text Analysis",
565
+ color_map = {'group': 'blue', 'issue': 'green', 'whitelist': 'red'}
566
+ ),
567
  ], theme='Jameswiller/Globe', title = title_prompt, description = description_prompt, allow_flagging = 'auto')
568
 
569
  #theme='gradio/monochrome'
 
572
  if __name__ == "__main__":
573
  demo.launch(share=True)
574
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
  '''
577
 
578
+ 1. Double counting occurs with two-word keywords
579
 
580
+ Title: Race inequality as a concept
581
+ Abstract: This paper is about race inequality.
582
 
583
+ 2. The original algorithm (mis)classifies this as RR; why?
584
 
585
+ ID: wos_rbpe_1032
586
+ Title: residential location and the earnings of african american women
587
+ Abstract: in comparing the earnings of african american women to three reference groupswhite women african american men and white menthree principal findings emerge. first african american women residing in the suburbs are worse off than any other suburban group. second central city african american women are worse off than any other group of central city residents. third while central city residence imposes a statistically significant earnings penalty on men of both races no such penalty is found for african american or white women. therefore african american women will enjoy no earnings advantage if they move to the suburbs. this finding underscores the importance of including women in studies of residential location and the socioeconomic status of african americans. a narrow focus on male data to inform policy is clearly insufficient. © 1995 springer. all rights reserved.
588
 
589
+ '''