Spaces:

aboltachka
/

rr_detector

Sleeping

App Files Files Community

aboltachka committed on Oct 23, 2023

Commit

d7f3d30

•

1 Parent(s): 21c30ae

Upload app.py

Browse files

Files changed (1) hide show

app.py +54 -67

app.py CHANGED Viewed

@@ -219,8 +219,7 @@ group_ethnicity = [('ethnicity',0,re.compile(r'\brac[a-zA-Z]{0,3}\b')),
            ('ethnicity',75,re.compile(r'\bsioux\b')),
            ('ethnicity',76,re.compile(r'\bsiouan\b')),
            ('ethnicity',77,re.compile(r'\bchippewa[a-zA-Z]{0,3}\b')),
-           ('ethnicity',78,re.compile(r'\bchoctaw[a-zA-Z]{0,3}\b')),
-           ('ethnicity',79,re.compile(r'\brace-related\b'))] #Added by Anton
 group_blackball = [('blackball',0, re.compile(r'\bblack.{0,3}market[a-zA-Z- ]{0,3}\b')),
                   ('blackball',1, re.compile(r'\bblack.{0,3}economy\b')),
@@ -385,18 +384,25 @@ def rr_detector(title_raw, abstract_raw):
     #Append dictionaries into a data frame for Detailed statistics
     df_issue = pd.DataFrame(list(issue_count.items()), columns=['term', 'freq'])
-    df_issue['type'] = 'issue'
     df_issue = df_issue[['type', 'term', 'freq']]
     df_group = pd.DataFrame(list(group_count.items()), columns=['term', 'freq'])
-    df_group['type'] = 'group'
     df_group = df_group[['type', 'term', 'freq']]
     df_blackball = pd.DataFrame(list(blackball_count.items()), columns=['term', 'freq'])
-    df_blackball['type'] = 'whitelist'
     df_blackball = df_blackball[['type', 'term', 'freq']]
     df_details = pd.concat([df_group, df_issue, df_blackball], ignore_index=True)
     #TEXT ANALYSIS
     #Dictionary with issue, topic, and blackball keywords
@@ -405,19 +411,33 @@ def rr_detector(title_raw, abstract_raw):
     keywords_dict["group"].extend(group_count.keys())
     keywords_dict["whitelist"].extend(blackball_count.keys())
-    combined_text = f"TITLE: {title_raw}. ABSTRACT: {abstract_raw}"
-    text_analysis = []
-    for word in combined_text.split():
-        print(word)
-        if word.lower() in [item.lower() for sublist in keywords_dict.values() for item in sublist]:
-            for key, words in keywords_dict.items():
-                if word.lower() in [item.lower() for item in words]:
-                    text_analysis.append((word, key))
                     break
-        else:
-             text_analysis.append((word, None))
     #FORM THE MAIN OUTPUT
     #Output
     if result_dict['match_1'] == 1:
@@ -435,7 +455,7 @@ def rr_detector(title_raw, abstract_raw):
             output_image = os.path.join(dirname, 'images/no.png')
             #Explanation
             unique_blackball_str = ', '.join(blackball_count)
-            answer = "This paper cannot be considered race-related, as it includes the blackball phrase(s), such as: " + unique_blackball_str + "."
         else:
             #Result
             output_image = os.path.join(dirname, 'images/no.png')
@@ -444,7 +464,7 @@ def rr_detector(title_raw, abstract_raw):
             #Details
             if len(issue_count.keys()) == 0 and len(group_count.keys()) == 0 and len(blackball_count.keys()) == 0 :
                 data = {
-                        "type": ["whitelist", "issue", "group"],
                         "term": ["term1", "term2", "term3"],
                         "freq": [0, 0, 0]
                         }
@@ -517,7 +537,7 @@ abstract_smpl = "Issues of racial justice and persistent economic inequalities a
 demo = gr.Interface(fn=rr_detector, inputs=[
-        gr.Textbox(label="Title", value=title_smpl, lines=2),
         gr.Textbox(label="Abstract", value=abstract_smpl, lines=18)],
     outputs=[
         gr.Image(label = 'Result', value=def_image),
@@ -535,12 +555,15 @@ demo = gr.Interface(fn=rr_detector, inputs=[
             width = 300,
             color_legend_title = 'Type of Keywords',
             x_title = "Keywords",
-            y_title = "Frequency"
         ),
         gr.HighlightedText(
         label="Text Analysis",
-        show_legend=True,
-        color_map={"group": "yellow", "issue": "blue", "whitelist": "grey"}),
     ], theme='Jameswiller/Globe', title = title_prompt, description = description_prompt, allow_flagging = 'auto')
 #theme='gradio/monochrome'
@@ -549,54 +572,18 @@ demo = gr.Interface(fn=rr_detector, inputs=[
 if __name__ == "__main__":
     demo.launch(share=True)
-'''
-# Add default picture for output
-# Output as graph of just text but with fancy representation -- use labels from theme
-# Generate picts for output with GenAi
-#RR
-title_raw = 'When expectations work race and socioeconomic differences in school performance'
-abstract_raw = 'Why race between are expectations for future performance realized more often by some people than by others and why are such differences in the efficacy of performance expectations socially patterned we hypothesize that differences in attentiveness to performance feedback may be relevant reasoning that follow-through behaviors will be less well conceived when expectations are formed without regard to evaluation of previous performance. using data from baltimore fourth-grade students and their parents we find that expectations anticipate marks more accurately when recall of prior marks is correct than when it is incorrect. because errors of recall mostly on the high side are more common among lower-ses and minority children and their parents their school performance is affected most strongly. research on school attainment process from a motivational perspective must give more attention to the additional resources that facilitate successful goal attainment given high expectations. our perspective focuses on resources internal to the individual but external constraints also are important. the discussion stresses the need for further work in both areas.'
-title_raw = "Race-related Research in Economics disadvantaged minor race disparity"
-abstract_raw = "Issues of race disparity "
-#Default
-title_raw = "Race-related Research in Economics"
-abstract_raw = "Issues of racial justice and persistent economic inequalities across racial and ethnic groups have risen to the top of public debate. The ability of academic economists to contribute to these debates in part depends on the production of race-related research in the profession. We study the issue combining information on a corpus of 250,000 publications in economics from 1960 to 2020 on which we use an algorithmic approach to classify race-related publications, constructing paths to publication for 22,000 NBER working papers between 1974 and 2015, and constructing the career prole of publications of 2800 economics faculty in US economics departments active in 2020/1. We present four new stylized facts on race-related research in economics."
-#non-RR
-title_raw = 'Hurting stalemate or mediation the conflict over nagorno-karabakh 1990-95'
-abstract_raw = 'The impacts of six attempts to mediate the conflict over the political status of nagorno-karabakh in the caucasus region of the former soviet union were compared. each mediation was intended to get the direct parties armenia azerbaijan and nagorno-karabakh to the negotiating table. nearly 4000 events were recorded for a six-year period from 1990 through 1995. each event was coded in terms of a six-step scale ranging from a significant action toward peace 3 to substantial violence directed at an adversary -3. time-series analyses of changes in the extent of violence showed no change from before to after any of the mediations. a significant change did occur however between the months preceding and following the period of intensive combat between april 1993 and february 1994. these results support the hypothesis that a mutually hurting stalemate is a condition for negotiating a ceasefire and reduced violence between warring parties. a number of theoretical and practical implications of the findings are discussed.'
-title_raw = ""
-abstract_raw = ""
-rr_detector(title_raw, abstract_raw)
 '''
-#TEXT ANALYSIS -- IMPROVE
-# Graph: it looks like a two-word keyword gets double-counted: (this paper is about racial inequality, this paper is about racial inequality)
-    #PROBLEM OF DOUBLE COUNT: GROUP (disadvantaged minor[a-zA-Z]{0,5}) and ISSUE (disadvantage)
-def highlight_words(sentence, words):
-    for i in range(len(sentence)):
-        for j in range(len(words)):
-            if sentence.lower().startswith(words[j].lower(), i):
-                sentence = sentence[:i] + sentence[i:i+len(words[j])].upper() + sentence[i+len(words[j]):]
-    return sentence
-print(highlight_words("Have a nIcE day, you Nice person!!", ["nice"]))
-print(highlight_words("Shhh, don't be so loud!", ["loud", "Be"]))
-print(highlight_words("Automating with Python is fun", ["fun", "auTomaTiNG"]))

            ('ethnicity',75,re.compile(r'\bsioux\b')),
            ('ethnicity',76,re.compile(r'\bsiouan\b')),
            ('ethnicity',77,re.compile(r'\bchippewa[a-zA-Z]{0,3}\b')),
+           ('ethnicity',78,re.compile(r'\bchoctaw[a-zA-Z]{0,3}\b'))]
 group_blackball = [('blackball',0, re.compile(r'\bblack.{0,3}market[a-zA-Z- ]{0,3}\b')),
                   ('blackball',1, re.compile(r'\bblack.{0,3}economy\b')),
     #Append dictionaries into a data frame for Detailed statistics
     df_issue = pd.DataFrame(list(issue_count.items()), columns=['term', 'freq'])
+    df_issue['type'] = 'ISSUE'
     df_issue = df_issue[['type', 'term', 'freq']]
     df_group = pd.DataFrame(list(group_count.items()), columns=['term', 'freq'])
+    df_group['type'] = 'GROUP'
     df_group = df_group[['type', 'term', 'freq']]
     df_blackball = pd.DataFrame(list(blackball_count.items()), columns=['term', 'freq'])
+    df_blackball['type'] = 'WHITELIST'
     df_blackball = df_blackball[['type', 'term', 'freq']]
     df_details = pd.concat([df_group, df_issue, df_blackball], ignore_index=True)
+    issue_default = {'type': 'ISSUE', 'term': '', 'freq': ''}
+    group_default = {'type': 'GROUP', 'term': '', 'freq': ''}
+    blackball_default = {'type': 'WHITELIST', 'term': '', 'freq': ''}
+    df_details.loc[len(df_details)] = issue_default
+    df_details.loc[len(df_details)] = group_default
+    df_details.loc[len(df_details)] = blackball_default
+    df_details = df_details.sort_values(by='type', ascending=False)
     #TEXT ANALYSIS
     #Dictionary with issue, topic, and blackball keywords
     keywords_dict["group"].extend(group_count.keys())
     keywords_dict["whitelist"].extend(blackball_count.keys())
+    combined_text = f"TITLE:\n{title_raw} \n \nABSTRACT:\n{abstract_raw}"
+    keywords = [(word, key, len(word)) for key, words in keywords_dict.items() for word in words]
+    keywords = sorted(keywords, key=lambda x: -x[2])
+    if len(keywords) > 0:
+        pattern = re.compile("|".join(map(re.escape, [x[0] for x in keywords])), re.IGNORECASE)
+        matches = re.finditer(pattern, combined_text)
+        text_analysis = []
+        last_end = 0
+        for match in matches:
+            start = match.start()
+            end = match.end()
+            if start != last_end:
+                text_analysis.append((combined_text[last_end:start], None))
+            for keyword, key, length in keywords:
+                if re.match(re.escape(keyword), match.group(), re.IGNORECASE):
+                    text_analysis.append((combined_text[start:end], key))
                     break
+            last_end = end
+        if last_end != len(combined_text):
+            text_analysis.append((combined_text[last_end:], None))
+    else:
+        text_analysis = [(combined_text, None)]
     #FORM THE MAIN OUTPUT
     #Output
     if result_dict['match_1'] == 1:
             output_image = os.path.join(dirname, 'images/no.png')
             #Explanation
             unique_blackball_str = ', '.join(blackball_count)
+            answer = "This paper cannot be considered race-related, as it includes the whitelist phrase(s), such as: " + unique_blackball_str + "."
         else:
             #Result
             output_image = os.path.join(dirname, 'images/no.png')
             #Details
             if len(issue_count.keys()) == 0 and len(group_count.keys()) == 0 and len(blackball_count.keys()) == 0 :
                 data = {
+                        "type": ["WHITELIST", "ISSUE", "GROUP"],
                         "term": ["term1", "term2", "term3"],
                         "freq": [0, 0, 0]
                         }
 demo = gr.Interface(fn=rr_detector, inputs=[
+        gr.Textbox(label="Title", value=title_smpl, lines=1),
         gr.Textbox(label="Abstract", value=abstract_smpl, lines=18)],
     outputs=[
         gr.Image(label = 'Result', value=def_image),
             width = 300,
             color_legend_title = 'Type of Keywords',
             x_title = "Keywords",
+            y_title = "Frequency",
+            show_label = True,
+            #sort = '-x',
+            color_legend_position = 'right',
         ),
         gr.HighlightedText(
         label="Text Analysis",
+        color_map = {'group': 'blue', 'issue': 'green', 'whitelist': 'red'}
+        ),
     ], theme='Jameswiller/Globe', title = title_prompt, description = description_prompt, allow_flagging = 'auto')
 #theme='gradio/monochrome'
 if __name__ == "__main__":
     demo.launch(share=True)
 '''
+1. Double counting occurs when a keyword consists of two words
+Title: Race inequality as a concept
+Abstract: This paper is about race inequality.
+2. The original algorithm (mis)classifies this as RR. Why?
+ID: wos_rbpe_1032
+Title: residential location and the earnings of african american women
+Abstract: in comparing the earnings of african american women to three reference groupswhite women african american men and white menthree principal findings emerge. first african american women residing in the suburbs are worse off than any other suburban group. second central city african american women are worse off than any other group of central city residents. third while central city residence imposes a statistically significant earnings penalty on men of both races no such penalty is found for african american or white women. therefore african american women will enjoy no earnings advantage if they move to the suburbs. this finding underscores the importance of including women in studies of residential location and the socioeconomic status of african americans. a narrow focus on male data to inform policy is clearly insufficient. © 1995 springer. all rights reserved.
+'''