Spaces:

mohdelgaar
/

LingConv

Running

App Files Files Community

mohdelgaar committed on Aug 5, 2024

Commit

20b7679

0 Parent(s):

Initial commit

Browse files

Files changed (9) hide show

.gitattributes +36 -0
README.md +10 -0
app.py +42 -0
compute_lng.py +57 -0
const.py +1053 -0
demo.py +371 -0
model.py +696 -0
options.py +158 -0
requirements.txt +10 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.npy filter=lfs diff=lfs merge=lfs -text
+*.jar filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.state filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: LingConv
+emoji: 🔁
+colorFrom: indigo
+colorTo: pink
+sdk: gradio
+sdk_version: 4.40.0
+app_file: app.py
+pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import nltk
+nltk.download('wordnet')
+import spacy
+spacy.cli.download('en_core_web_sm')
+from const import name_map
+from demo import run_gradio
+from model import EncoderDecoderVAE
+from options import parse_args
+import numpy as np
+from transformers import T5Tokenizer
+import torch
+import joblib
+import pandas as pd
+def process_examples(samples, full_names):
+    for i in range(len(samples)):
+        sample = samples[i]
+        input_text = tokenizer.decode(sample['sentence1_input_ids'], skip_special_tokens=True)
+        ling1 = scaler.inverse_transform([sample['sentence1_ling']])[0]
+        ling2 = scaler.inverse_transform([sample['sentence2_ling']])[0]
+        ling = pd.DataFrame({'Index': full_names, 'Source': ling1, 'Target': ling2})
+        samples[i] = [input_text, ling]
+    return list(samples)
+args, args_list, lng_names = parse_args(ckpt='./ckpt/model.pt')  # checkpoint path is handed to parse_args as a keyword
+tokenizer = T5Tokenizer.from_pretrained(args.model_name)
+device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU only when torch reports one
+scaler = joblib.load('assets/scaler.bin')  # NOTE(review): joblib.load unpickles; only safe for trusted asset files
+full_names = [name_map[x] for x in lng_names]  # each name in lng_names looked up in name_map
+samples = joblib.load('assets/samples.bin')
+examples = process_examples(samples, full_names)  # relies on tokenizer and scaler assigned above
+ling_collection = np.load('assets/ling_collection.npy')
+model = EncoderDecoderVAE(args, tokenizer.pad_token_id, tokenizer.get_vocab()['</s>']).to(device)  # '</s>' id is read from the vocab dict
+state = torch.load(args.ckpt, map_location=torch.device('cpu'))  # tensors are mapped to CPU on load
+model.load_state_dict(state['model'], strict=False)  # strict=False: key mismatches do not raise
+model.eval()
+run_gradio(model, tokenizer, scaler, ling_collection, examples, full_names)

compute_lng.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from lng.lca.lc_anc import lca
+from lng.L2SCA.analyzeText import sca
+import lftk
+import spacy
+nlp = spacy.load("en_core_web_sm")
+def extract_lingfeat(text):
+    from lingfeat import extractor
+    LingFeat = extractor.pass_text(text)
+    LingFeat.preprocess()
+    d = {}
+    d.update(LingFeat.WoKF_()) # Wikipedia Knowledge Features
+    d.update(LingFeat.WBKF_()) # WeeBit Corpus Knowledge Features
+    d.update(LingFeat.OSKF_()) # OneStopEng Corpus Knowledge Features
+    # Discourse (Disco) Features
+    d.update(LingFeat.EnDF_()) # Entity Density Features
+    d.update(LingFeat.EnGF_()) # Entity Grid Features
+    # Syntactic (Synta) Features
+    # d.update(LingFeat.PhrF_()) # Noun/Verb/Adj/Adv/... Phrasal Features (logging stanza)
+    # d.update(LingFeat.TrSF_()) # (Parse) Tree Structural Features (logging stanza)
+    d.update(LingFeat.POSF_()) # Noun/Verb/Adj/Adv/... Part-of-Speech Features
+    # Lexico Semantic (LxSem) Features
+    d.update(LingFeat.TTRF_()) # Type Token Ratio Features
+    d.update(LingFeat.VarF_()) # Noun/Verb/Adj/Adv Variation Features
+    d.update(LingFeat.PsyF_()) # Psycholinguistic Difficulty of Words (AoA Kuperman)
+    d.update(LingFeat.WorF_()) # Word Familiarity from Frequency Count (SubtlexUS)
+    # Shallow Traditional (ShTra) Features
+    d.update(LingFeat.ShaF_()) # Shallow Features (e.g. avg number of tokens)
+    d.update(LingFeat.TraF_()) # Traditional Formulas
+    return list(d.values())
+def extract_lftk(text):
+    if text == '':
+        return [0.] * 220
+    doc = nlp(text)
+    LFTK = lftk.Extractor(doc)
+    feats = LFTK.extract()
+    return list(feats.values())
+def compute_lng(text, shortcut = False):
+    lca_feats = lca(text)
+    if shortcut:
+        sca_feats = [0] * 23
+    else:
+        sca_feats = sca(text)
+    lftk = extract_lftk(text)
+    all_feats = lca_feats + sca_feats + lftk
+    return all_feats

const.py ADDED Viewed

	@@ -0,0 +1,1053 @@

+import pandas as pd
+sca_names = "W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C-S,VP-T,C-T,DC-C,DC-T,T-S,\
+CT-T,CP-T,CP-C,CN-T,CN-C".split(',')
+lca_names = "wordtypes,swordtypes,lextypes,slextypes,wordtokens,swordtokens,\
+lextokens,slextokens,ld,ls1,ls2,vs1,vs2,cvs1,ndw,ndwz,ndwerz,ndwesz,ttr,\
+msttr,cttr,rttr,logttr,uber,lv,vv1,svv1,cvv1,vv2,nv,adjv,advv,modv".split(',')
+lftk_names = [  # Short LFTK feature keys, 220 entries; NOTE(review): presumably index-aligned with lftk_full_names, confirm
+        't_word', 't_stopword', 't_punct', 't_syll', 't_syll2', 't_syll3', 't_uword', 't_sent', 't_char', 'a_word_ps', 'a_char_ps',
+        'a_char_pw', 'a_syll_ps', 'a_syll_pw', 'a_stopword_ps', 'a_stopword_pw', 't_kup', 't_bry', 't_subtlex_us_zipf', 'a_kup_pw',
+        'a_bry_pw', 'a_kup_ps', 'a_bry_ps', 'a_subtlex_us_zipf_pw', 'a_subtlex_us_zipf_ps', 't_n_ent', 't_n_ent_person', 't_n_ent_norp',
+        't_n_ent_fac', 't_n_ent_org', 't_n_ent_gpe', 't_n_ent_loc', 't_n_ent_product', 't_n_ent_event', 't_n_ent_art', 't_n_ent_law',
+        't_n_ent_language', 't_n_ent_date', 't_n_ent_time', 't_n_ent_percent', 't_n_ent_money', 't_n_ent_quantity', 't_n_ent_ordinal',
+        't_n_ent_cardinal', 'a_n_ent_pw', 'a_n_ent_person_pw', 'a_n_ent_norp_pw', 'a_n_ent_fac_pw', 'a_n_ent_org_pw', 'a_n_ent_gpe_pw',
+        'a_n_ent_loc_pw', 'a_n_ent_product_pw', 'a_n_ent_event_pw', 'a_n_ent_art_pw', 'a_n_ent_law_pw', 'a_n_ent_language_pw',
+        'a_n_ent_date_pw', 'a_n_ent_time_pw', 'a_n_ent_percent_pw', 'a_n_ent_money_pw', 'a_n_ent_quantity_pw', 'a_n_ent_ordinal_pw',
+        'a_n_ent_cardinal_pw', 'a_n_ent_ps', 'a_n_ent_person_ps', 'a_n_ent_norp_ps', 'a_n_ent_fac_ps', 'a_n_ent_org_ps', 'a_n_ent_gpe_ps',
+        'a_n_ent_loc_ps', 'a_n_ent_product_ps', 'a_n_ent_event_ps', 'a_n_ent_art_ps', 'a_n_ent_law_ps', 'a_n_ent_language_ps',
+        'a_n_ent_date_ps', 'a_n_ent_time_ps', 'a_n_ent_percent_ps', 'a_n_ent_money_ps', 'a_n_ent_quantity_ps', 'a_n_ent_ordinal_ps',
+        'a_n_ent_cardinal_ps', 'simp_adj_var', 'simp_adp_var', 'simp_adv_var', 'simp_aux_var', 'simp_cconj_var', 'simp_det_var',
+        'simp_intj_var', 'simp_noun_var', 'simp_num_var', 'simp_part_var', 'simp_pron_var', 'simp_propn_var', 'simp_punct_var',
+        'simp_sconj_var', 'simp_sym_var', 'simp_verb_var', 'simp_space_var', 'root_adj_var', 'root_adp_var', 'root_adv_var', 'root_aux_var',
+        'root_cconj_var', 'root_det_var', 'root_intj_var', 'root_noun_var', 'root_num_var', 'root_part_var', 'root_pron_var', 'root_propn_var',
+        'root_punct_var', 'root_sconj_var', 'root_sym_var', 'root_verb_var', 'root_space_var', 'corr_adj_var', 'corr_adp_var', 'corr_adv_var',
+        'corr_aux_var', 'corr_cconj_var', 'corr_det_var', 'corr_intj_var', 'corr_noun_var', 'corr_num_var', 'corr_part_var', 'corr_pron_var',
+        'corr_propn_var', 'corr_punct_var', 'corr_sconj_var', 'corr_sym_var', 'corr_verb_var', 'corr_space_var', 'simp_ttr', 'root_ttr',
+        'corr_ttr', 'bilog_ttr', 'uber_ttr', 'simp_ttr_no_lem', 'root_ttr_no_lem', 'corr_ttr_no_lem', 'bilog_ttr_no_lem', 'uber_ttr_no_lem',
+        'n_adj', 'n_adp', 'n_adv', 'n_aux', 'n_cconj', 'n_det', 'n_intj', 'n_noun', 'n_num', 'n_part', 'n_pron', 'n_propn', 'n_punct',
+        'n_sconj', 'n_sym', 'n_verb', 'n_space', 'n_uadj', 'n_uadp', 'n_uadv', 'n_uaux', 'n_ucconj', 'n_udet', 'n_uintj', 'n_unoun',
+        'n_unum', 'n_upart', 'n_upron', 'n_upropn', 'n_upunct', 'n_usconj', 'n_usym', 'n_uverb', 'n_uspace', 'a_adj_pw', 'a_adp_pw',
+        'a_adv_pw', 'a_aux_pw', 'a_cconj_pw', 'a_det_pw', 'a_intj_pw', 'a_noun_pw', 'a_num_pw', 'a_part_pw', 'a_pron_pw', 'a_propn_pw',
+        'a_punct_pw', 'a_sconj_pw', 'a_sym_pw', 'a_verb_pw', 'a_space_pw', 'a_adj_ps', 'a_adp_ps', 'a_adv_ps', 'a_aux_ps', 'a_cconj_ps',
+        'a_det_ps', 'a_intj_ps', 'a_noun_ps', 'a_num_ps', 'a_part_ps', 'a_pron_ps', 'a_propn_ps', 'a_punct_ps', 'a_sconj_ps', 'a_sym_ps',
+        'a_verb_ps', 'a_space_ps', 'fkre', 'fkgl', 'fogi', 'smog', 'cole', 'auto', 'rt_fast', 'rt_average', 'rt_slow']
+lftk_full_names = ['Total Number Of Words', 'Total Number Of Stop Words',  # Long-form LFTK labels, 220 entries; NOTE(review): presumably index-aligned with lftk_names, confirm
+        'Total Number Of Puntuations', 'Total Number Of Syllables',  # NOTE(review): "Puntuations" is misspelled in this runtime label; left unchanged
+        'Total Number Of Words More Than Two Syllables', 'Total Number Of Words More Than Three Syllables',
+        'Total Number Of Unique Words', 'Total Number Of Sentences',
+        'Total Number Of Characters', 'Average Number Of Words Per Sentence',
+        'Average Number Of Characters Per Sentence', 'Average Number Of Characters Per Word',
+        'Average Number Of Syllables Per Sentence', 'Average Number Of Syllables Per Word',
+        'Average Number Of Stop Words Per Sentence', 'Average Number Of Stop Words Per Word',
+        'Total Kuperman Age Of Acquistion Of Words', 'Total Brysbaert Age Of Acquistion Of Words',  # NOTE(review): "Acquistion" is misspelled here and in the labels below; left unchanged
+        'Total Subtlex Us Zipf Of Words', 'Average Kuperman Age Of Acquistion Of Words Per Word',
+        'Average Brysbaert Age Of Acquistion Of Words Per Word', 'Average Kuperman Age Of Acquistion Of Words Per Sentence',
+        'Average Brysbaert Age Of Acquistion Of Words Per Sentence', 'Average Subtlex Us Zipf Of Words Per Word',
+        'Average Subtlex Us Zipf Of Words Per Sentence', 'Total Number Of Named Entities',
+        'Total Number Of Named Entities Person', 'Total Number Of Named Entities Norp',
+        'Total Number Of Named Entities Fac', 'Total Number Of Named Entities Org',
+        'Total Number Of Named Entities Gpe', 'Total Number Of Named Entities Loc',
+        'Total Number Of Named Entities Product', 'Total Number Of Named Entities Event',
+        'Total Number Of Named Entities Art', 'Total Number Of Named Entities Law',
+        'Total Number Of Named Entities Language', 'Total Number Of Named Entities Date',
+        'Total Number Of Named Entities Time', 'Total Number Of Named Entities Percent',
+        'Total Number Of Named Entities Money', 'Total Number Of Named Entities Quantity',
+        'Total Number Of Named Entities Ordinal', 'Total Number Of Named Entities Cardinal',
+        'Average Number Of Named Entities Per Word', 'Average Number Of Named Entities Person Per Word',
+        'Average Number Of Named Entities Norp Per Word', 'Average Number Of Named Entities Fac Per Word',
+        'Average Number Of Named Entities Org Per Word', 'Average Number Of Named Entities Gpe Per Word',
+        'Average Number Of Named Entities Loc Per Word', 'Average Number Of Named Entities Product Per Word',
+        'Average Number Of Named Entities Event Per Word', 'Average Number Of Named Entities Art Per Word',
+        'Average Number Of Named Entities Law Per Word', 'Average Number Of Named Entities Language Per Word',
+        'Average Number Of Named Entities Date Per Word', 'Average Number Of Named Entities Time Per Word',
+        'Average Number Of Named Entities Percent Per Word', 'Average Number Of Named Entities Money Per Word',
+        'Average Number Of Named Entities Quantity Per Word', 'Average Number Of Named Entities Ordinal Per Word',
+        'Average Number Of Named Entities Cardinal Per Word', 'Average Number Of Named Entities Per Sentence',
+        'Average Number Of Named Entities Person Per Sentence', 'Average Number Of Named Entities Norp Per Sentence',
+        'Average Number Of Named Entities Fac Per Sentence', 'Average Number Of Named Entities Org Per Sentence',
+        'Average Number Of Named Entities Gpe Per Sentence', 'Average Number Of Named Entities Loc Per Sentence',
+        'Average Number Of Named Entities Product Per Sentence', 'Average Number Of Named Entities Event Per Sentence',
+        'Average Number Of Named Entities Art Per Sentence', 'Average Number Of Named Entities Law Per Sentence',
+        'Average Number Of Named Entities Language Per Sentence', 'Average Number Of Named Entities Date Per Sentence',
+        'Average Number Of Named Entities Time Per Sentence', 'Average Number Of Named Entities Percent Per Sentence',
+        'Average Number Of Named Entities Money Per Sentence', 'Average Number Of Named Entities Quantity Per Sentence',
+        'Average Number Of Named Entities Ordinal Per Sentence', 'Average Number Of Named Entities Cardinal Per Sentence',
+        'Simple Adjectives Variation', 'Simple Adpositions Variation',
+        'Simple Adverbs Variation', 'Simple Auxiliaries Variation',
+        'Simple Coordinating Conjunctions Variation', 'Simple Determiners Variation',
+        'Simple Interjections Variation', 'Simple Nouns Variation',
+        'Simple Numerals Variation', 'Simple Particles Variation',
+        'Simple Pronouns Variation', 'Simple Proper Nouns Variation',
+        'Simple Punctuations Variation', 'Simple Subordinating Conjunctions Variation',
+        'Simple Symbols Variation', 'Simple Verbs Variation',
+        'Simple Spaces Variation', 'Root Adjectives Variation',
+        'Root Adpositions Variation', 'Root Adverbs Variation',
+        'Root Auxiliaries Variation', 'Root Coordinating Conjunctions Variation',
+        'Root Determiners Variation', 'Root Interjections Variation',
+        'Root Nouns Variation', 'Root Numerals Variation',
+        'Root Particles Variation', 'Root Pronouns Variation',
+        'Root Proper Nouns Variation', 'Root Punctuations Variation',
+        'Root Subordinating Conjunctions Variation', 'Root Symbols Variation',
+        'Root Verbs Variation', 'Root Spaces Variation',
+        'Corrected Adjectives Variation', 'Corrected Adpositions Variation',
+        'Corrected Adverbs Variation', 'Corrected Auxiliaries Variation',
+        'Corrected Coordinating Conjunctions Variation', 'Corrected Determiners Variation',
+        'Corrected Interjections Variation', 'Corrected Nouns Variation',
+        'Corrected Numerals Variation', 'Corrected Particles Variation',
+        'Corrected Pronouns Variation', 'Corrected Proper Nouns Variation',
+        'Corrected Punctuations Variation', 'Corrected Subordinating Conjunctions Variation',
+        'Corrected Symbols Variation', 'Corrected Verbs Variation',
+        'Corrected Spaces Variation', 'Simple Type Token Ratio',
+        'Root Type Token Ratio', 'Corrected Type Token Ratio',
+        'Bilogarithmic Type Token Ratio', 'Uber Type Token Ratio',
+        'Simple Type Token Ratio No Lemma', 'Root Type Token Ratio No Lemma',
+        'Corrected Type Token Ratio No Lemma', 'Bilogarithmic Type Token Ratio No Lemma',
+        'Uber Type Token Ratio No Lemma', 'Total Number Of Adjectives',
+        'Total Number Of Adpositions', 'Total Number Of Adverbs',
+        'Total Number Of Auxiliaries', 'Total Number Of Coordinating Conjunctions',
+        'Total Number Of Determiners', 'Total Number Of Interjections',
+        'Total Number Of Nouns', 'Total Number Of Numerals',
+        'Total Number Of Particles', 'Total Number Of Pronouns',
+        'Total Number Of Proper Nouns', 'Total Number Of Punctuations',
+        'Total Number Of Subordinating Conjunctions', 'Total Number Of Symbols',
+        'Total Number Of Verbs', 'Total Number Of Spaces',
+        'Total Number Of Unique Adjectives', 'Total Number Of Unique Adpositions',
+        'Total Number Of Unique Adverbs', 'Total Number Of Unique Auxiliaries',
+        'Total Number Of Unique Coordinating Conjunctions', 'Total Number Of Unique Determiners',
+        'Total Number Of Unique Interjections', 'Total Number Of Unique Nouns',
+        'Total Number Of Unique Numerals', 'Total Number Of Unique Particles',
+        'Total Number Of Unique Pronouns', 'Total Number Of Unique Proper Nouns',
+        'Total Number Of Unique Punctuations', 'Total Number Of Unique Subordinating Conjunctions',
+        'Total Number Of Unique Symbols', 'Total Number Of Unique Verbs',
+        'Total Number Of Unique Spaces', 'Average Number Of Adjectives Per Word',
+        'Average Number Of Adpositions Per Word', 'Average Number Of Adverbs Per Word',
+        'Average Number Of Auxiliaries Per Word', 'Average Number Of Coordinating Conjunctions Per Word',
+        'Average Number Of Determiners Per Word', 'Average Number Of Interjections Per Word',
+        'Average Number Of Nouns Per Word', 'Average Number Of Numerals Per Word',
+        'Average Number Of Particles Per Word', 'Average Number Of Pronouns Per Word',
+        'Average Number Of Proper Nouns Per Word', 'Average Number Of Punctuations Per Word',
+        'Average Number Of Subordinating Conjunctions Per Word', 'Average Number Of Symbols Per Word',
+        'Average Number Of Verbs Per Word', 'Average Number Of Spaces Per Word',
+        'Average Number Of Adjectives Per Sentence', 'Average Number Of Adpositions Per Sentence',
+        'Average Number Of Adverbs Per Sentence', 'Average Number Of Auxiliaries Per Sentence',
+        'Average Number Of Coordinating Conjunctions Per Sentence', 'Average Number Of Determiners Per Sentence',
+        'Average Number Of Interjections Per Sentence', 'Average Number Of Nouns Per Sentence',
+        'Average Number Of Numerals Per Sentence', 'Average Number Of Particles Per Sentence',
+        'Average Number Of Pronouns Per Sentence', 'Average Number Of Proper Nouns Per Sentence',
+        'Average Number Of Punctuations Per Sentence', 'Average Number Of Subordinating Conjunctions Per Sentence',
+        'Average Number Of Symbols Per Sentence', 'Average Number Of Verbs Per Sentence',
+        'Average Number Of Spaces Per Sentence', 'Flesch Kincaid Reading Ease',
+        'Flesch Kincaid Grade Level', 'Gunning Fog Index',
+        'Smog Index', 'Coleman Liau Index',
+        'Automated Readability Index', 'Reading Time For Fast Readers',
+        'Reading Time For Average Readers', 'Reading Time For Slow Readers']
+full_names = [  # Display labels, 56 entries; NOTE(review): appears to parallel lca_names (33) followed by sca_names (23), confirm
+'Unique words',
+'Unique sophisticated words',
+'Unique lexical words',
+'Unique sophisticated lexical words',
+'Total words',
+'Total sophisticated words',
+'Total lexical words',
+'Total sophisticated lexical words',
+'Lexical density',
+'Lexical sophistication (total)',
+'Lexical sophistication (unique)',
+'Verb sophistication',
+'Verb sophistication (squared numerator)',
+'Verb sophistication (sqrt denominator)',
+'Unique words',  # NOTE(review): same label as the first entry; confirm the duplicate display name is intended
+'Unique words in first k tokens',
+'Unique words in random k tokens (average of 10 samples)',
+'Unique words in random sequence of k words (average of 10 samples)',
+'Ratio of unique words',
+'Mean TTR of all k word segments',
+'Corrected TTR (sqrt(2N) denominator)',
+'Root TTR (sqrt(N) denominator)',
+'Log TTR',
+'Uber',
+'D Measure',
+'Ratio of unique verbs',
+'Verb variation with squared numerator',
+'Verb variation with (sqrt(2N)) denominator',
+'Verb variation over all lexical words',
+'Noun variation',
+'Adjective variation',
+'Adverb variation',
+'(Ajd + Adv) variation',  # NOTE(review): "Ajd" looks like a typo for "Adj"; runtime label left unchanged
+'# words',
+'# sentences',
+'# verb phrases',
+'# clauses',
+'# T-units',
+'# dependent clauses',
+'# complex T-units',
+'# coordinate phrases',
+'# complex nominals',
+'Mean length of sentence',
+'Mean length of T-unit',
+'Mean unit of clause',  # NOTE(review): if aligned with sca_names this is MLC, presumably "Mean length of clause"; confirm
+'Clauses per sentence',
+'Verb phrases per T-unit',
+'Clauses per T-unit',
+'Dependent clause ratio',
+'Dependent clause per T-unit',
+'T-units per sentence',
+'Complex T-unit ratio',
+'Coordinate phrases per T-unit',
+'Coordinate phrases per clause',
+'Complex nominals per T-unit',
+'Complex nominals per clause',
+]
+lingfeat_names = [  # LingFeat feature keys; NOTE(review): 185 entries by count while lingfeat_subtypes / lingfeat_types hold 255, confirm how these lists are paired
+        'WRich05_S', 'WRich10_S', 'WRich15_S', 'WRich20_S', 'WClar05_S', 'WClar10_S',
+        'WClar15_S', 'WClar20_S', 'WNois05_S', 'WNois10_S', 'WNois15_S', 'WNois20_S',
+        'WTopc05_S', 'WTopc10_S', 'WTopc15_S', 'WTopc20_S', 'BRich05_S', 'BRich10_S',
+        'BRich15_S', 'BRich20_S', 'BClar05_S', 'BClar10_S', 'BClar15_S', 'BClar20_S',
+        'BNois05_S', 'BNois10_S', 'BNois15_S', 'BNois20_S', 'BTopc05_S', 'BTopc10_S',
+        'BTopc15_S', 'BTopc20_S', 'to_EntiM_C', 'as_EntiM_C', 'at_EntiM_C', 'to_UEnti_C',
+        'as_UEnti_C', 'at_UEnti_C', 'ra_SSTo_C', 'ra_SOTo_C', 'ra_SXTo_C', 'ra_SNTo_C',
+        'ra_OSTo_C', 'ra_OOTo_C', 'ra_OXTo_C', 'ra_ONTo_C', 'ra_XSTo_C', 'ra_XOTo_C',
+        'ra_XXTo_C', 'ra_XNTo_C', 'ra_NSTo_C', 'ra_NOTo_C', 'ra_NXTo_C', 'ra_NNTo_C',
+        'LoCohPA_S', 'LoCohPW_S', 'LoCohPU_S', 'LoCoDPA_S', 'LoCoDPW_S', 'LoCoDPU_S',
+        'to_NoTag_C', 'as_NoTag_C', 'at_NoTag_C', 'ra_NoAjT_C', 'ra_NoVeT_C', 'ra_NoAvT_C',
+        'ra_NoSuT_C', 'ra_NoCoT_C', 'to_VeTag_C', 'as_VeTag_C', 'at_VeTag_C', 'ra_VeAjT_C',
+        'ra_VeNoT_C', 'ra_VeAvT_C', 'ra_VeSuT_C', 'ra_VeCoT_C', 'to_AjTag_C', 'as_AjTag_C',
+        'at_AjTag_C', 'ra_AjNoT_C', 'ra_AjVeT_C', 'ra_AjAvT_C', 'ra_AjSuT_C', 'ra_AjCoT_C',
+        'to_AvTag_C', 'as_AvTag_C', 'at_AvTag_C', 'ra_AvAjT_C', 'ra_AvNoT_C', 'ra_AvVeT_C',
+        'ra_AvSuT_C', 'ra_AvCoT_C', 'to_SuTag_C', 'as_SuTag_C', 'at_SuTag_C', 'ra_SuAjT_C',
+        'ra_SuNoT_C', 'ra_SuVeT_C', 'ra_SuAvT_C', 'ra_SuCoT_C', 'to_CoTag_C', 'as_CoTag_C',
+        'at_CoTag_C', 'ra_CoAjT_C', 'ra_CoNoT_C', 'ra_CoVeT_C', 'ra_CoAvT_C', 'ra_CoSuT_C',
+        'to_ContW_C', 'as_ContW_C', 'at_ContW_C', 'to_FuncW_C', 'as_FuncW_C', 'at_FuncW_C',
+        'ra_CoFuW_C', 'SimpTTR_S', 'CorrTTR_S', 'BiLoTTR_S', 'UberTTR_S', 'MTLDTTR_S',
+        'SimpNoV_S', 'SquaNoV_S', 'CorrNoV_S', 'SimpVeV_S', 'SquaVeV_S', 'CorrVeV_S',
+        'SimpAjV_S', 'SquaAjV_S', 'CorrAjV_S', 'SimpAvV_S', 'SquaAvV_S', 'CorrAvV_S',
+        'to_AAKuW_C', 'as_AAKuW_C', 'at_AAKuW_C', 'to_AAKuL_C', 'as_AAKuL_C', 'at_AAKuL_C',
+        'to_AABiL_C', 'as_AABiL_C', 'at_AABiL_C', 'to_AABrL_C', 'as_AABrL_C', 'at_AABrL_C',
+        'to_AACoL_C', 'as_AACoL_C', 'at_AACoL_C', 'to_SbFrQ_C', 'as_SbFrQ_C', 'at_SbFrQ_C',
+        'to_SbCDC_C', 'as_SbCDC_C', 'at_SbCDC_C', 'to_SbFrL_C', 'as_SbFrL_C', 'at_SbFrL_C',
+        'to_SbCDL_C', 'as_SbCDL_C', 'at_SbCDL_C', 'to_SbSBW_C', 'as_SbSBW_C', 'at_SbSBW_C',
+        'to_SbL1W_C', 'as_SbL1W_C', 'at_SbL1W_C', 'to_SbSBC_C', 'as_SbSBC_C', 'at_SbSBC_C',
+        'to_SbL1C_C', 'as_SbL1C_C', 'at_SbL1C_C', 'TokSenM_S', 'TokSenS_S', 'TokSenL_S',
+        'as_Token_C', 'as_Sylla_C', 'at_Sylla_C', 'as_Chara_C', 'at_Chara_C', 'FleschG_S',
+        'AutoRea_S', 'ColeLia_S', 'SmogInd_S', 'Gunning_S', 'LinseaW_S'
+        ]
+lingfeat_subtypes = [
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Knowledge Feats",
+"Entity Density Feats",
+"Entity Density Feats",
+"Entity Density Feats",
+"Entity Density Feats",
+"Entity Density Feats",
+"Entity Density Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Entity Grid Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Phrasal Feats",
+"Tree Structure Feats",
+"Tree Structure Feats",
+"Tree Structure Feats",
+"Tree Structure Feats",
+"Tree Structure Feats",
+"Tree Structure Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"POS Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"Variation Ratio Feats",
+"TTR Feats",
+"TTR Feats",
+"TTR Feats",
+"TTR Feats",
+"TTR Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Psycholinguistic Feats",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Word Familiarity",
+"Shallow Feats",
+"Shallow Feats",
+"Shallow Feats",
+"Shallow Feats",
+"Shallow Feats",
+"Shallow Feats",
+"Shallow Feats",
+"Shallow Feats",
+"Traditional Formulas",
+"Traditional Formulas",
+"Traditional Formulas",
+"Traditional Formulas",
+"Traditional Formulas",
+"Traditional Formulas",
+]
+lingfeat_types = [
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"AdSem",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Disco",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"Synta",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"LxSem",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+"ShaTr",
+]
+lf_names = """| 1     | AdSem  | WoKF_         | Wiki Knowledge Features             | WRich05_S    | Semantic Richness, 50 topics extracted from Wikipedia                          |
+| 2     | AdSem  | WoKF_         | Wiki Knowledge Features             | WClar05_S    | Semantic Clarity, 50 topics extracted from Wikipedia                           |
+| 3     | AdSem  | WoKF_         | Wiki Knowledge Features             | WNois05_S    | Semantic Noise, 50 topics extracted from Wikipedia                             |
+| 4     | AdSem  | WoKF_         | Wiki Knowledge Features             | WTopc05_S    | Number of topics, 50 topics extracted from Wikipedia                           |
+| 5     | AdSem  | WoKF_         | Wiki Knowledge Features             | WRich10_S    | Semantic Richness, 100 topics extracted from Wikipedia                         |
+| 6     | AdSem  | WoKF_         | Wiki Knowledge Features             | WClar10_S    | Semantic Clarity, 100 topics extracted from Wikipedia                          |
+| 7     | AdSem  | WoKF_         | Wiki Knowledge Features             | WNois10_S    | Semantic Noise, 100 topics extracted from Wikipedia                            |
+| 8     | AdSem  | WoKF_         | Wiki Knowledge Features             | WTopc10_S    | Number of topics, 100 topics extracted from Wikipedia                          |
+| 9     | AdSem  | WoKF_         | Wiki Knowledge Features             | WRich15_S    | Semantic Richness, 150 topics extracted from Wikipedia                         |
+| 10    | AdSem  | WoKF_         | Wiki Knowledge Features             | WClar15_S    | Semantic Clarity, 150 topics extracted from Wikipedia                          |
+| 11    | AdSem  | WoKF_         | Wiki Knowledge Features             | WNois15_S    | Semantic Noise, 150 topics extracted from Wikipedia                            |
+| 12    | AdSem  | WoKF_         | Wiki Knowledge Features             | WTopc15_S    | Number of topics, 150 topics extracted from Wikipedia                          |
+| 13    | AdSem  | WoKF_         | Wiki Knowledge Features             | WRich20_S    | Semantic Richness, 200 topics extracted from Wikipedia                         |
+| 14    | AdSem  | WoKF_         | Wiki Knowledge Features             | WClar20_S    | Semantic Clarity, 200 topics extracted from Wikipedia                          |
+| 15    | AdSem  | WoKF_         | Wiki Knowledge Features             | WNois20_S    | Semantic Noise, 200 topics extracted from Wikipedia                            |
+| 16    | AdSem  | WoKF_         | Wiki Knowledge Features             | WTopc20_S    | Number of topics, 200 topics extracted from Wikipedia                          |
+| 17    | AdSem  | WBKF_         | WB Knowledge Features     | BRich05_S    | Semantic Richness, 50 topics extracted from WeeBit Corpus                  |
+| 18    | AdSem  | WBKF_         | WB Knowledge Features     | BClar05_S    | Semantic Clarity, 50 topics extracted from WeeBit Corpus                       |
+| 19    | AdSem  | WBKF_         | WB Knowledge Features     | BNois05_S    | Semantic Noise, 50 topics extracted from WeeBit Corpus                         |
+| 20    | AdSem  | WBKF_         | WB Knowledge Features     | BTopc05_S    | Number of topics, 50 topics extracted from WeeBit Corpus                       |
+| 21    | AdSem  | WBKF_         | WB Knowledge Features     | BRich10_S    | Semantic Richness, 100 topics extracted from WeeBit Corpus                 |
+| 22    | AdSem  | WBKF_         | WB Knowledge Features     | BClar10_S    | Semantic Clarity, 100 topics extracted from WeeBit Corpus                      |
+| 23    | AdSem  | WBKF_         | WB Knowledge Features     | BNois10_S    | Semantic Noise, 100 topics extracted from WeeBit Corpus                        |
+| 24    | AdSem  | WBKF_         | WB Knowledge Features     | BTopc10_S    | Number of topics, 100 topics extracted from WeeBit Corpus                      |
+| 25    | AdSem  | WBKF_         | WB Knowledge Features     | BRich15_S    | Semantic Richness, 150 topics extracted from WeeBit Corpus                 |
+| 26    | AdSem  | WBKF_         | WB Knowledge Features     | BClar15_S    | Semantic Clarity, 150 topics extracted from WeeBit Corpus                      |
+| 27    | AdSem  | WBKF_         | WB Knowledge Features     | BNois15_S    | Semantic Noise, 150 topics extracted from WeeBit Corpus                        |
+| 28    | AdSem  | WBKF_         | WB Knowledge Features     | BTopc15_S    | Number of topics, 150 topics extracted from WeeBit Corpus                      |
+| 29    | AdSem  | WBKF_         | WB Knowledge Features     | BRich20_S    | Semantic Richness, 200 topics extracted from WeeBit Corpus                 |
+| 30    | AdSem  | WBKF_         | WB Knowledge Features     | BClar20_S    | Semantic Clarity, 200 topics extracted from WeeBit Corpus                      |
+| 31    | AdSem  | WBKF_         | WB Knowledge Features     | BNois20_S    | Semantic Noise, 200 topics extracted from WeeBit Corpus                        |
+| 32    | AdSem  | WBKF_         | WB Knowledge Features     | BTopc20_S    | Number of topics, 200 topics extracted from WeeBit Corpus                      |
+| 33    | AdSem  | OSKF_         | OSE Knowledge Features | ORich05_S    | Semantic Richness, 50 topics extracted from OneStopEng Corpus              |
+| 34    | AdSem  | OSKF_         | OSE Knowledge Features | OClar05_S    | Semantic Clarity, 50 topics extracted from OneStopEng Corpus                   |
+| 35    | AdSem  | OSKF_         | OSE Knowledge Features | ONois05_S    | Semantic Noise, 50 topics extracted from OneStopEng Corpus                     |
+| 36    | AdSem  | OSKF_         | OSE Knowledge Features | OTopc05_S    | Number of topics, 50 topics extracted from OneStopEng Corpus                   |
+| 37    | AdSem  | OSKF_         | OSE Knowledge Features | ORich10_S    | Semantic Richness, 100 topics extracted from OneStopEng Corpus             |
+| 38    | AdSem  | OSKF_         | OSE Knowledge Features | OClar10_S    | Semantic Clarity, 100 topics extracted from OneStopEng Corpus                  |
+| 39    | AdSem  | OSKF_         | OSE Knowledge Features | ONois10_S    | Semantic Noise, 100 topics extracted from OneStopEng Corpus                    |
+| 40    | AdSem  | OSKF_         | OSE Knowledge Features | OTopc10_S    | Number of topics, 100 topics extracted from OneStopEng Corpus                  |
+| 41    | AdSem  | OSKF_         | OSE Knowledge Features | ORich15_S    | Semantic Richness, 150 topics extracted from OneStopEng Corpus             |
+| 42    | AdSem  | OSKF_         | OSE Knowledge Features | OClar15_S    | Semantic Clarity, 150 topics extracted from OneStopEng Corpus                  |
+| 43    | AdSem  | OSKF_         | OSE Knowledge Features | ONois15_S    | Semantic Noise, 150 topics extracted from OneStopEng Corpus                    |
+| 44    | AdSem  | OSKF_         | OSE Knowledge Features | OTopc15_S    | Number of topics, 150 topics extracted from OneStopEng Corpus                  |
+| 45    | AdSem  | OSKF_         | OSE Knowledge Features | ORich20_S    | Semantic Richness, 200 topics extracted from OneStopEng Corpus             |
+| 46    | AdSem  | OSKF_         | OSE Knowledge Features | OClar20_S    | Semantic Clarity, 200 topics extracted from OneStopEng Corpus                  |
+| 47    | AdSem  | OSKF_         | OSE Knowledge Features | ONois20_S    | Semantic Noise, 200 topics extracted from OneStopEng Corpus                    |
+| 48    | AdSem  | OSKF_         | OSE Knowledge Features | OTopc20_S    | Number of topics, 200 topics extracted from OneStopEng Corpus                  |
+| 49    | Disco           | EnDF_         | Entity Density Features              | to_EntiM_C   | total number of Entities Mentions counts                                       |
+| 50    | Disco           | EnDF_         | Entity Density Features              | as_EntiM_C   | average number of Entities Mentions counts per sentence                        |
+| 51    | Disco           | EnDF_         | Entity Density Features              | at_EntiM_C   | average number of Entities Mentions counts per token (word)                    |
+| 52    | Disco           | EnDF_         | Entity Density Features              | to_UEnti_C   | total number of unique Entities                                                |
+| 53    | Disco           | EnDF_         | Entity Density Features              | as_UEnti_C   | average number of unique Entities per sentence                                 |
+| 54    | Disco           | EnDF_         | Entity Density Features              | at_UEnti_C   | average number of unique Entities per token (word)                             |
+| 55    | Disco           | EnGF_         | Entity Grid Features                 | ra_SSTo_C   | ratio of ss transitions to total                                               |
+| 56    | Disco           | EnGF_         | Entity Grid Features                 | ra_SOTo_C   | ratio of so transitions to total                                               |
+| 57    | Disco           | EnGF_         | Entity Grid Features                 | ra_SXTo_C   | ratio of sx transitions to total                                               |
+| 58    | Disco           | EnGF_         | Entity Grid Features                 | ra_SNTo_C   | ratio of sn transitions to total                                               |
+| 59    | Disco           | EnGF_         | Entity Grid Features                 | ra_OSTo_C   | ratio of os transitions to total                                               |
+| 60    | Disco           | EnGF_         | Entity Grid Features                 | ra_OOTo_C   | ratio of oo transitions to total                                               |
+| 61    | Disco           | EnGF_         | Entity Grid Features                 | ra_OXTo_C   | ratio of ox transitions to total                                               |
+| 62    | Disco           | EnGF_         | Entity Grid Features                 | ra_ONTo_C   | ratio of on transitions to total                                               |
+| 63    | Disco           | EnGF_         | Entity Grid Features                 | ra_XSTo_C   | ratio of xs transitions to total                                               |
+| 64    | Disco           | EnGF_         | Entity Grid Features                 | ra_XOTo_C   | ratio of xo transitions to total                                               |
+| 65    | Disco           | EnGF_         | Entity Grid Features                 | ra_XXTo_C   | ratio of xx transitions to total                                               |
+| 66    | Disco           | EnGF_         | Entity Grid Features                 | ra_XNTo_C   | ratio of xn transitions to total                                               |
+| 67    | Disco           | EnGF_         | Entity Grid Features                 | ra_NSTo_C   | ratio of ns transitions to total                                               |
+| 68    | Disco           | EnGF_         | Entity Grid Features                 | ra_NOTo_C   | ratio of no transitions to total                                               |
+| 69    | Disco           | EnGF_         | Entity Grid Features                 | ra_NXTo_C   | ratio of nx transitions to total                                               |
+| 70    | Disco           | EnGF_         | Entity Grid Features                 | ra_NNTo_C   | ratio of nn transitions to total                                               |
+| 71    | Disco           | EnGF_         | Entity Grid Features                 | LoCohPA_S    | Local Coherence for PA score                                                   |
+| 72    | Disco           | EnGF_         | Entity Grid Features                 | LoCohPW_S    | Local Coherence for PW score                                                   |
+| 73    | Disco           | EnGF_         | Entity Grid Features                 | LoCohPU_S    | Local Coherence for PU score                                                   |
+| 74    | Disco           | EnGF_         | Entity Grid Features                 | LoCoDPA_S    | Local Coherence distance for PA score                                          |
+| 75    | Disco           | EnGF_         | Entity Grid Features                 | LoCoDPW_S    | Local Coherence distance for PW score                                          |
+| 76    | Disco           | EnGF_         | Entity Grid Features                 | LoCoDPU_S    | Local Coherence distance for PU score                                          |
+| 77    | Synta           | PhrF_         | Phrasal Features                     | to_NoPhr_C   | total count of Noun phrases                                                    |
+| 78    | Synta           | PhrF_         | Phrasal Features                     | as_NoPhr_C   | average count of Noun phrases per sentence                                     |
+| 79    | Synta           | PhrF_         | Phrasal Features                     | at_NoPhr_C   | average count of Noun phrases per token                                        |
+| 80    | Synta           | PhrF_         | Phrasal Features                     | ra_NoVeP_C   | ratio of Noun phrases count to Verb phrases count                              |
+| 81    | Synta           | PhrF_         | Phrasal Features                     | ra_NoSuP_C   | ratio of Noun phrases count to Subordinate Clauses count                       |
+| 82    | Synta           | PhrF_         | Phrasal Features                     | ra_NoPrP_C   | ratio of Noun phrases count to Prep phrases count                              |
+| 83    | Synta           | PhrF_         | Phrasal Features                     | ra_NoAjP_C   | ratio of Noun phrases count to Adj phrases count                               |
+| 84    | Synta           | PhrF_         | Phrasal Features                     | ra_NoAvP_C   | ratio of Noun phrases count to Adv phrases count                               |
+| 85    | Synta           | PhrF_         | Phrasal Features                     | to_VePhr_C   | total count of Verb phrases                                                    |
+| 86    | Synta           | PhrF_         | Phrasal Features                     | as_VePhr_C   | average count of Verb phrases per sentence                                     |
+| 87    | Synta           | PhrF_         | Phrasal Features                     | at_VePhr_C   | average count of Verb phrases per token                                        |
+| 88    | Synta           | PhrF_         | Phrasal Features                     | ra_VeNoP_C   | ratio of Verb phrases count to Noun phrases count                              |
+| 89    | Synta           | PhrF_         | Phrasal Features                     | ra_VeSuP_C   | ratio of Verb phrases count to Subordinate Clauses count                       |
+| 90    | Synta           | PhrF_         | Phrasal Features                     | ra_VePrP_C   | ratio of Verb phrases count to Prep phrases count                              |
+| 91    | Synta           | PhrF_         | Phrasal Features                     | ra_VeAjP_C   | ratio of Verb phrases count to Adj phrases count                               |
+| 92    | Synta           | PhrF_         | Phrasal Features                     | ra_VeAvP_C   | ratio of Verb phrases count to Adv phrases count                               |
+| 93    | Synta           | PhrF_         | Phrasal Features                     | to_SuPhr_C   | total count of Subordinate Clauses                                             |
+| 94    | Synta           | PhrF_         | Phrasal Features                     | as_SuPhr_C   | average count of Subordinate Clauses per sentence                              |
+| 95    | Synta           | PhrF_         | Phrasal Features                     | at_SuPhr_C   | average count of Subordinate Clauses per token                                 |
+| 96    | Synta           | PhrF_         | Phrasal Features                     | ra_SuNoP_C   | ratio of Subordinate Clauses count to Noun phrases count                       |
+| 97    | Synta           | PhrF_         | Phrasal Features                     | ra_SuVeP_C   | ratio of Subordinate Clauses count to Verb phrases count                       |
+| 98    | Synta           | PhrF_         | Phrasal Features                     | ra_SuPrP_C   | ratio of Subordinate Clauses count to Prep phrases count                       |
+| 99    | Synta           | PhrF_         | Phrasal Features                     | ra_SuAjP_C   | ratio of Subordinate Clauses count to Adj phrases count                        |
+| 100   | Synta           | PhrF_         | Phrasal Features                     | ra_SuAvP_C   | ratio of Subordinate Clauses count to Adv phrases count                        |
+| 101   | Synta           | PhrF_         | Phrasal Features                     | to_PrPhr_C   | total count of prepositional phrases                                           |
+| 102   | Synta           | PhrF_         | Phrasal Features                     | as_PrPhr_C   | average count of prepositional phrases per sentence                            |
+| 103   | Synta           | PhrF_         | Phrasal Features                     | at_PrPhr_C   | average count of prepositional phrases per token                               |
+| 104   | Synta           | PhrF_         | Phrasal Features                     | ra_PrNoP_C   | ratio of Prep phrases count to Noun phrases count                              |
+| 105   | Synta           | PhrF_         | Phrasal Features                     | ra_PrVeP_C   | ratio of Prep phrases count to Verb phrases count                              |
+| 106   | Synta           | PhrF_         | Phrasal Features                     | ra_PrSuP_C   | ratio of Prep phrases count to Subordinate Clauses count                       |
+| 107   | Synta           | PhrF_         | Phrasal Features                     | ra_PrAjP_C   | ratio of Prep phrases count to Adj phrases count                               |
+| 108   | Synta           | PhrF_         | Phrasal Features                     | ra_PrAvP_C   | ratio of Prep phrases count to Adv phrases count                               |
+| 109   | Synta           | PhrF_         | Phrasal Features                     | to_AjPhr_C   | total count of Adjective phrases                                               |
+| 110   | Synta           | PhrF_         | Phrasal Features                     | as_AjPhr_C   | average count of Adjective phrases per sentence                                |
+| 111   | Synta           | PhrF_         | Phrasal Features                     | at_AjPhr_C   | average count of Adjective phrases per token                                   |
+| 112   | Synta           | PhrF_         | Phrasal Features                     | ra_AjNoP_C   | ratio of Adj phrases count to Noun phrases count                               |
+| 113   | Synta           | PhrF_         | Phrasal Features                     | ra_AjVeP_C   | ratio of Adj phrases count to Verb phrases count                               |
+| 114   | Synta           | PhrF_         | Phrasal Features                     | ra_AjSuP_C   | ratio of Adj phrases count to Subordinate Clauses count                        |
+| 115   | Synta           | PhrF_         | Phrasal Features                     | ra_AjPrP_C   | ratio of Adj phrases count to Prep phrases count                               |
+| 116   | Synta           | PhrF_         | Phrasal Features                     | ra_AjAvP_C   | ratio of Adj phrases count to Adv phrases count                                |
+| 117   | Synta           | PhrF_         | Phrasal Features                     | to_AvPhr_C   | total count of Adverb phrases                                                  |
+| 118   | Synta           | PhrF_         | Phrasal Features                     | as_AvPhr_C   | average count of Adverb phrases per sentence                                   |
+| 119   | Synta           | PhrF_         | Phrasal Features                     | at_AvPhr_C   | average count of Adverb phrases per token                                      |
+| 120   | Synta           | PhrF_         | Phrasal Features                     | ra_AvNoP_C   | ratio of Adv phrases count to Noun phrases count                               |
+| 121   | Synta           | PhrF_         | Phrasal Features                     | ra_AvVeP_C   | ratio of Adv phrases count to Verb phrases count                               |
+| 122   | Synta           | PhrF_         | Phrasal Features                     | ra_AvSuP_C   | ratio of Adv phrases count to Subordinate Clauses count                        |
+| 123   | Synta           | PhrF_         | Phrasal Features                     | ra_AvPrP_C   | ratio of Adv phrases count to Prep phrases count                               |
+| 124   | Synta           | PhrF_         | Phrasal Features                     | ra_AvAjP_C   | ratio of Adv phrases count to Adj phrases count                                |
+| 125   | Synta           | TrSF_         | Tree Structure Features              | to_TreeH_C   | total Tree height of all sentences                                             |
+| 126   | Synta           | TrSF_         | Tree Structure Features              | as_TreeH_C   | average Tree height per sentence                                               |
+| 127   | Synta           | TrSF_         | Tree Structure Features              | at_TreeH_C   | average Tree height per token (word)                                           |
+| 128   | Synta           | TrSF_         | Tree Structure Features              | to_FTree_C   | total length of flattened Trees                                                |
+| 129   | Synta           | TrSF_         | Tree Structure Features              | as_FTree_C   | average length of flattened Trees per sentence                                 |
+| 130   | Synta           | TrSF_         | Tree Structure Features              | at_FTree_C   | average length of flattened Trees per token (word)                             |
+| 131   | Synta           | POSF_         | Part-of-Speech Features              | to_NoTag_C   | total count of Noun POS tags                                                   |
+| 132   | Synta           | POSF_         | Part-of-Speech Features              | as_NoTag_C   | average count of Noun POS tags per sentence                                    |
+| 133   | Synta           | POSF_         | Part-of-Speech Features              | at_NoTag_C   | average count of Noun POS tags per token                                       |
+| 134   | Synta           | POSF_         | Part-of-Speech Features              | ra_NoAjT_C   | ratio of Noun POS count to Adjective POS count                                 |
+| 135   | Synta           | POSF_         | Part-of-Speech Features              | ra_NoVeT_C   | ratio of Noun POS count to Verb POS count                                      |
+| 136   | Synta           | POSF_         | Part-of-Speech Features              | ra_NoAvT_C   | ratio of Noun POS count to Adverb POS count                                    |
+| 137   | Synta           | POSF_         | Part-of-Speech Features              | ra_NoSuT_C   | ratio of Noun POS count to Subordinating Conjunction count                     |
+| 138   | Synta           | POSF_         | Part-of-Speech Features              | ra_NoCoT_C   | ratio of Noun POS count to Coordinating Conjunction count                      |
+| 139   | Synta           | POSF_         | Part-of-Speech Features              | to_VeTag_C   | total count of Verb POS tags                                                   |
+| 140   | Synta           | POSF_         | Part-of-Speech Features              | as_VeTag_C   | average count of Verb POS tags per sentence                                    |
+| 141   | Synta           | POSF_         | Part-of-Speech Features              | at_VeTag_C   | average count of Verb POS tags per token                                       |
+| 142   | Synta           | POSF_         | Part-of-Speech Features              | ra_VeAjT_C   | ratio of Verb POS count to Adjective POS count                                 |
+| 143   | Synta           | POSF_         | Part-of-Speech Features              | ra_VeNoT_C   | ratio of Verb POS count to Noun POS count                                      |
+| 144   | Synta           | POSF_         | Part-of-Speech Features              | ra_VeAvT_C   | ratio of Verb POS count to Adverb POS count                                    |
+| 145   | Synta           | POSF_         | Part-of-Speech Features              | ra_VeSuT_C   | ratio of Verb POS count to Subordinating Conjunction count                     |
+| 146   | Synta           | POSF_         | Part-of-Speech Features              | ra_VeCoT_C   | ratio of Verb POS count to Coordinating Conjunction count                      |
+| 147   | Synta           | POSF_         | Part-of-Speech Features              | to_AjTag_C   | total count of Adjective POS tags                                              |
+| 148   | Synta           | POSF_         | Part-of-Speech Features              | as_AjTag_C   | average count of Adjective POS tags per sentence                               |
+| 149   | Synta           | POSF_         | Part-of-Speech Features              | at_AjTag_C   | average count of Adjective POS tags per token                                  |
+| 150   | Synta           | POSF_         | Part-of-Speech Features              | ra_AjNoT_C   | ratio of Adjective POS count to Noun POS count                                 |
+| 151   | Synta           | POSF_         | Part-of-Speech Features              | ra_AjVeT_C   | ratio of Adjective POS count to Verb POS count                                 |
+| 152   | Synta           | POSF_         | Part-of-Speech Features              | ra_AjAvT_C   | ratio of Adjective POS count to Adverb POS count                               |
+| 153   | Synta           | POSF_         | Part-of-Speech Features              | ra_AjSuT_C   | ratio of Adjective POS count to Subordinating Conjunction count                |
+| 154   | Synta           | POSF_         | Part-of-Speech Features              | ra_AjCoT_C   | ratio of Adjective POS count to Coordinating Conjunction count                 |
+| 155   | Synta           | POSF_         | Part-of-Speech Features              | to_AvTag_C   | total count of Adverb POS tags                                                 |
+| 156   | Synta           | POSF_         | Part-of-Speech Features              | as_AvTag_C   | average count of Adverb POS tags per sentence                                  |
+| 157   | Synta           | POSF_         | Part-of-Speech Features              | at_AvTag_C   | average count of Adverb POS tags per token                                     |
+| 158   | Synta           | POSF_         | Part-of-Speech Features              | ra_AvAjT_C   | ratio of Adverb POS count to Adjective POS count                               |
+| 159   | Synta           | POSF_         | Part-of-Speech Features              | ra_AvNoT_C   | ratio of Adverb POS count to Noun POS count                                    |
+| 160   | Synta           | POSF_         | Part-of-Speech Features              | ra_AvVeT_C   | ratio of Adverb POS count to Verb POS count                                    |
+| 161   | Synta           | POSF_         | Part-of-Speech Features              | ra_AvSuT_C   | ratio of Adverb POS count to Subordinating Conjunction count                   |
+| 162   | Synta           | POSF_         | Part-of-Speech Features              | ra_AvCoT_C   | ratio of Adverb POS count to Coordinating Conjunction count                    |
+| 163   | Synta           | POSF_         | Part-of-Speech Features              | to_SuTag_C   | total count of Subordinating Conjunction POS tags                              |
+| 164   | Synta           | POSF_         | Part-of-Speech Features              | as_SuTag_C   | average count of Subordinating Conjunction POS tags per sentence               |
+| 165   | Synta           | POSF_         | Part-of-Speech Features              | at_SuTag_C   | average count of Subordinating Conjunction POS tags per token                  |
+| 166   | Synta           | POSF_         | Part-of-Speech Features              | ra_SuAjT_C   | ratio of Subordinating Conjunction POS count to Adjective POS count            |
+| 167   | Synta           | POSF_         | Part-of-Speech Features              | ra_SuNoT_C   | ratio of Subordinating Conjunction POS count to Noun POS count                 |
+| 168   | Synta           | POSF_         | Part-of-Speech Features              | ra_SuVeT_C   | ratio of Subordinating Conjunction POS count to Verb POS count                 |
+| 169   | Synta           | POSF_         | Part-of-Speech Features              | ra_SuAvT_C   | ratio of Subordinating Conjunction POS count to Adverb POS count               |
+| 170   | Synta           | POSF_         | Part-of-Speech Features              | ra_SuCoT_C   | ratio of Subordinating Conjunction POS count to Coordinating Conjunction count |
+| 171   | Synta           | POSF_         | Part-of-Speech Features              | to_CoTag_C   | total count of Coordinating Conjunction POS tags                               |
+| 172   | Synta           | POSF_         | Part-of-Speech Features              | as_CoTag_C   | average count of Coordinating Conjunction POS tags per sentence                |
+| 173   | Synta           | POSF_         | Part-of-Speech Features              | at_CoTag_C   | average count of Coordinating Conjunction POS tags per token                   |
+| 174   | Synta           | POSF_         | Part-of-Speech Features              | ra_CoAjT_C   | ratio of Coordinating Conjunction POS count to Adjective POS count             |
+| 175   | Synta           | POSF_         | Part-of-Speech Features              | ra_CoNoT_C   | ratio of Coordinating Conjunction POS count to Noun POS count                  |
+| 176   | Synta           | POSF_         | Part-of-Speech Features              | ra_CoVeT_C   | ratio of Coordinating Conjunction POS count to Verb POS count                  |
+| 177   | Synta           | POSF_         | Part-of-Speech Features              | ra_CoAvT_C   | ratio of Coordinating Conjunction POS count to Adverb POS count                |
+| 178   | Synta           | POSF_         | Part-of-Speech Features              | ra_CoSuT_C   | ratio of Coordinating Conjunction POS count to Subordinating Conjunction count |
+| 179   | Synta           | POSF_         | Part-of-Speech Features              | to_ContW_C   | total count of Content words                                                   |
+| 180   | Synta           | POSF_         | Part-of-Speech Features              | as_ContW_C   | average count of Content words per sentence                                    |
+| 181   | Synta           | POSF_         | Part-of-Speech Features              | at_ContW_C   | average count of Content words per token                                       |
+| 182   | Synta           | POSF_         | Part-of-Speech Features              | to_FuncW_C   | total count of Function words                                                  |
+| 183   | Synta           | POSF_         | Part-of-Speech Features              | as_FuncW_C   | average count of Function words per sentence                                   |
+| 184   | Synta           | POSF_         | Part-of-Speech Features              | at_FuncW_C   | average count of Function words per token                                      |
+| 185   | Synta           | POSF_         | Part-of-Speech Features              | ra_CoFuW_C   | ratio of Content words to Function words                                       |
+| 186   | LxSem     | VarF_         | Variation Ratio Features             | SimpNoV_S    | unique Nouns/total Nouns (Noun Variation-1)                                    |
+| 187   | LxSem     | VarF_         | Variation Ratio Features             | SquaNoV_S    | (unique Nouns**2)/total Nouns (Squared Noun Variation-1)                       |
+| 188   | LxSem     | VarF_         | Variation Ratio Features             | CorrNoV_S    | unique Nouns/sqrt(2*total Nouns) (Corrected Noun Variation-1)                  |
+| 189   | LxSem     | VarF_         | Variation Ratio Features             | SimpVeV_S    | unique Verbs/total Verbs (Verb Variation-1)                                    |
+| 190   | LxSem     | VarF_         | Variation Ratio Features             | SquaVeV_S    | (unique Verbs**2)/total Verbs (Squared Verb Variation-1)                       |
+| 191   | LxSem     | VarF_         | Variation Ratio Features             | CorrVeV_S    | unique Verbs/sqrt(2*total Verbs) (Corrected Verb Variation-1)                  |
+| 192   | LxSem     | VarF_         | Variation Ratio Features             | SimpAjV_S    | unique Adjectives/total Adjectives (Adjective Variation-1)                     |
+| 193   | LxSem     | VarF_         | Variation Ratio Features             | SquaAjV_S    | (unique Adjectives**2)/total Adjectives (Squared Adjective Variation-1)        |
+| 194   | LxSem     | VarF_         | Variation Ratio Features             | CorrAjV_S    | unique Adjectives/sqrt(2*total Adjectives) (Corrected Adjective Variation-1)   |
+| 195   | LxSem     | VarF_         | Variation Ratio Features             | SimpAvV_S    | unique Adverbs/total Adverbs (AdVerb Variation-1)                              |
+| 196   | LxSem     | VarF_         | Variation Ratio Features             | SquaAvV_S    | (unique Adverbs**2)/total Adverbs (Squared AdVerb Variation-1)                 |
+| 197   | LxSem     | VarF_         | Variation Ratio Features             | CorrAvV_S    | unique Adverbs/sqrt(2*total Adverbs) (Corrected AdVerb Variation-1)            |
+| 198   | LxSem     | TTRF_         | Type Token Ratio Features            | SimpTTR_S    | unique tokens/total tokens (TTR)                                               |
+| 199   | LxSem     | TTRF_         | Type Token Ratio Features            | CorrTTR_S    | unique tokens/sqrt(2*total tokens) (Corrected TTR)                             |
+| 200   | LxSem     | TTRF_         | Type Token Ratio Features            | BiLoTTR_S    | log(unique tokens)/log(total tokens) (Bi-Logarithmic TTR)                      |
+| 201   | LxSem     | TTRF_         | Type Token Ratio Features            | UberTTR_S    | (log(unique tokens))^2/log(total tokens/unique tokens) (Uber Index)            |
+| 202   | LxSem     | TTRF_         | Type Token Ratio Features            | MTLDTTR_S    | Measure of Textual Lexical Diversity (default TTR = 0.72)                      |
+| 203   | LxSem     | PsyF_         | Psycholinguistic Features            | to_AAKuW_C   | total AoA (Age of Acquisition) of words                                        |
+| 204   | LxSem     | PsyF_         | Psycholinguistic Features            | as_AAKuW_C   | average AoA of words per sentence                                              |
+| 205   | LxSem     | PsyF_         | Psycholinguistic Features            | at_AAKuW_C   | average AoA of words per token                                                 |
+| 206   | LxSem     | PsyF_         | Psycholinguistic Features            | to_AAKuL_C   | total lemmas AoA of lemmas                                                     |
+| 207   | LxSem     | PsyF_         | Psycholinguistic Features            | as_AAKuL_C   | average lemmas AoA of lemmas per sentence                                      |
+| 208   | LxSem     | PsyF_         | Psycholinguistic Features            | at_AAKuL_C   | average lemmas AoA of lemmas per token                                         |
+| 209   | LxSem     | PsyF_         | Psycholinguistic Features            | to_AABiL_C   | total lemmas AoA of lemmas, Bird norm                                          |
+| 210   | LxSem     | PsyF_         | Psycholinguistic Features            | as_AABiL_C   | average lemmas AoA of lemmas, Bird norm per sentence                           |
+| 211   | LxSem     | PsyF_         | Psycholinguistic Features            | at_AABiL_C   | average lemmas AoA of lemmas, Bird norm per token                              |
+| 212   | LxSem     | PsyF_         | Psycholinguistic Features            | to_AABrL_C   | total lemmas AoA of lemmas, Bristol norm                                       |
+| 213   | LxSem     | PsyF_         | Psycholinguistic Features            | as_AABrL_C   | average lemmas AoA of lemmas, Bristol norm per sentence                        |
+| 214   | LxSem     | PsyF_         | Psycholinguistic Features            | at_AABrL_C   | average lemmas AoA of lemmas, Bristol norm per token                           |
+| 215   | LxSem     | PsyF_         | Psycholinguistic Features            | to_AACoL_C   | total AoA of lemmas, Cortese and Khanna norm                                   |
+| 216   | LxSem     | PsyF_         | Psycholinguistic Features            | as_AACoL_C   | average AoA of lemmas, Cortese and Khanna norm per sentence                    |
+| 217   | LxSem     | PsyF_         | Psycholinguistic Features            | at_AACoL_C   | average AoA of lemmas, Cortese and Khanna norm per token                       |
+| 218   | LxSem     | WorF_         | Word Familiarity              | to_SbFrQ_C   | total SubtlexUS FREQcount value                                                |
+| 219   | LxSem     | WorF_         | Word Familiarity              | as_SbFrQ_C   | average SubtlexUS FREQcount value per sentenc                                  |
+| 220   | LxSem     | WorF_         | Word Familiarity              | at_SbFrQ_C   | average SubtlexUS FREQcount value per token                                    |
+| 221   | LxSem     | WorF_         | Word Familiarity              | to_SbCDC_C   | total SubtlexUS CDcount value                                                  |
+| 222   | LxSem     | WorF_         | Word Familiarity              | as_SbCDC_C   | average SubtlexUS CDcount value per sentence                                   |
+| 223   | LxSem     | WorF_         | Word Familiarity              | at_SbCDC_C   | average SubtlexUS CDcount value per token                                      |
+| 224   | LxSem     | WorF_         | Word Familiarity              | to_SbFrL_C   | total SubtlexUS FREQlow value                                                  |
+| 225   | LxSem     | WorF_         | Word Familiarity              | as_SbFrL_C   | average SubtlexUS FREQlow value per sentence                                   |
+| 226   | LxSem     | WorF_         | Word Familiarity              | at_SbFrL_C   | average SubtlexUS FREQlow value per token                                      |
+| 227   | LxSem     | WorF_         | Word Familiarity              | to_SbCDL_C   | total SubtlexUS CDlow value                                                    |
+| 228   | LxSem     | WorF_         | Word Familiarity              | as_SbCDL_C   | average SubtlexUS CDlow value per sentence                                     |
+| 229   | LxSem     | WorF_         | Word Familiarity              | at_SbCDL_C   | average SubtlexUS CDlow value per token                                        |
+| 230   | LxSem     | WorF_         | Word Familiarity              | to_SbSBW_C   | total SubtlexUS SUBTLWF value                                                  |
+| 231   | LxSem     | WorF_         | Word Familiarity              | as_SbSBW_C   | average SubtlexUS SUBTLWF value per sentence                                   |
+| 232   | LxSem     | WorF_         | Word Familiarity              | at_SbSBW_C   | average SubtlexUS SUBTLWF value per token                                      |
+| 233   | LxSem     | WorF_         | Word Familiarity              | to_SbL1W_C   | total SubtlexUS Lg10WF value                                                   |
+| 234   | LxSem     | WorF_         | Word Familiarity              | as_SbL1W_C   | average SubtlexUS Lg10WF value per sentence                                    |
+| 235   | LxSem     | WorF_         | Word Familiarity              | at_SbL1W_C   | average SubtlexUS Lg10WF value per token                                       |
+| 236   | LxSem     | WorF_         | Word Familiarity              | to_SbSBC_C   | total SubtlexUS SUBTLCD value                                                  |
+| 237   | LxSem     | WorF_         | Word Familiarity              | as_SbSBC_C   | average SubtlexUS SUBTLCD value per sentence                                   |
+| 238   | LxSem     | WorF_         | Word Familiarity              | at_SbSBC_C   | average SubtlexUS SUBTLCD value per token                                      |
+| 239   | LxSem     | WorF_         | Word Familiarity              | to_SbL1C_C   | total SubtlexUS Lg10CD value                                                   |
+| 240   | LxSem     | WorF_         | Word Familiarity              | as_SbL1C_C   | average SubtlexUS Lg10CD value per sentence                                    |
+| 241   | LxSem     | WorF_         | Word Familiarity              | at_SbL1C_C   | average SubtlexUS Lg10CD value per token                                       |
+| 242   | ShaTr     | ShaF_         | Shallow Features                     | TokSenM_S   | total count of tokens x total count of sentence                                |
+| 243   | ShaTr     | ShaF_         | Shallow Features                     | TokSenS_S   | sqrt(total count of tokens x total count of sentence)                                |
+| 244   | ShaTr     | ShaF_         | Shallow Features                     | TokSenL_S   | log(total count of tokens)/log(total count of sentence)                           |
+| 245   | ShaTr     | ShaF_         | Shallow Features                     | as_Token_C   | average count of tokens per sentence                                           |
+| 246   | ShaTr     | ShaF_         | Shallow Features                     | as_Sylla_C   | average count of syllables per sentence                                        |
+| 247   | ShaTr     | ShaF_         | Shallow Features                     | at_Sylla_C         | average count of syllables per token                                           |
+| 248   | ShaTr     | ShaF_         | Shallow Features                     | as_Chara_C   | average count of characters per sentence                                       |
+| 249   | ShaTr     | ShaF_         | Shallow Features                     | at_Chara_C   | average count of characters per token                                          |
+| 250   | ShaTr     | TraF_         | Traditional Formulas         | SmogInd_S    | Smog Index                                                                     |
+| 251   | ShaTr     | TraF_         | Traditional Formulas         | ColeLia_S    | Coleman Liau Readability Score                                                 |
+| 252   | ShaTr     | TraF_         | Traditional Formulas         | Gunning_S    | Gunning Fog Count Score                                                                    |
+| 253   | ShaTr     | TraF_         | Traditional Formulas         | AutoRea_S    | New Automated Readability Index                                                    |
+| 254   | ShaTr     | TraF_         | Traditional Formulas         | FleschG_S    | Flesch Kincaid Grade Level                                                           |
+| 255   | ShaTr     | TraF_         | Traditional Formulas         | LinseaW_S    | Linsear Write Formula Score"""
+# Lookup tables built from the feature-name lists and tables defined earlier in this module.
+# Combined list of feature keys: lca_names followed by sca_names.
+lsca_names = lca_names + sca_names
+# Map each key in lsca_names to the entry of full_names at the same position.
+name_map = {lsca_names[i]: full_names[i] for i in range(len(lsca_names))}
+# Map each lingfeat key to its subtype, then tag every lca key as 'lexical'
+# and every sca key as 'syntax' (later updates override earlier entries).
+type_map = {lingfeat_names[i]: lingfeat_subtypes[i] for i in range(len(lingfeat_names))}
+type_map.update({n: 'lexical' for n in lca_names})
+type_map.update({n: 'syntax' for n in sca_names})
+# from lingfeat_full_names import lf_names
+# Parse the '|'-separated table held in lf_names: fields 5 and 6 of each row
+# (the key and description columns in the visible rows) become key -> description.
+# NOTE(review): any row with fewer than 7 '|'-separated fields would break the
+# two-value unpacking below -- confirm against the start of the table.
+lf_names = lf_names.split('\n')
+lf_names = [tuple(x.split('|')[5:7]) for x in lf_names]
+lf_map = {k.strip(): v.strip() for k,v in lf_names}
+name_map.update(lf_map)
+# Positions selected from lca_names + sca_names + lftk_names (applied to
+# lftkplus_names below).
+used_indices = [
+        1, 2, 3, 4, 5, 6, 7, 10, 11, 18, 25, 30, 31, 34, 35, 36, 37, 39, 40, 41, 57,
+        63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
+        257, 258, 261, 263, 272, 274
+        ]
+# eval_indices is first given in the same numbering as used_indices, then
+# re-expressed as positions within used_indices (ValueError if one is missing).
+eval_indices = [4,5,6,18,257,272]
+eval_indices = [used_indices.index(idx) for idx in eval_indices]
+# Read at import time, relative to the current working directory; the 'key'
+# and 'domain' columns are used to extend type_map.
+lftk_df = pd.read_csv('lftk_ids.csv')
+lftk_types = {row['key']: row['domain'] for i,row in lftk_df.iterrows()}
+type_map.update(lftk_types)
+# Normalize type labels: 'surface' -> 'syntax', 'lexico-semantics' -> 'lexical';
+# every other label is kept as is.
+type_map = {k:\
+        'syntax' if v == 'surface'\
+        else 'lexical' if v == 'lexico-semantics'\
+        else v\
+        for k,v in type_map.items()}
+# Full ordered key list, reduced to the positions listed in used_indices.
+lftkplus_names = lca_names + sca_names + lftk_names
+lftkplus_names = [lftkplus_names[i] for i in used_indices]
+# Add key -> full name entries for the lftk features, pairing lftk_names with
+# lftk_full_names positionally.
+lftk_map = {k: v for k,v in zip(lftk_names, lftk_full_names)}
+name_map.update(lftk_map)
+# Reverse lookup: full name -> key (if two keys share a full name, the later
+# entry in name_map wins).
+rev_name_map = {v: k for k,v in name_map.items()}

demo.py ADDED Viewed

	@@ -0,0 +1,371 @@

+def run_gradio(model, tokenizer, scaler, ling_collection, examples=None, lng_names=None, M=None):
+    import numpy as np
+    import torch
+    from datetime import datetime
+    from compute_lng import compute_lng
+    import gradio as gr
+    m = np.load('assets/m.npy')
+    m = -1/m
+    m[m == -np.inf] = 0
+    m /= 100
+    device = model.backbone.device
+    def visibility(mode):
+        if mode == 0:
+            vis_group = group1
+        elif mode == 1:
+            vis_group = group2
+        elif mode == 2:
+            vis_group = group3
+        output = [gr.update(value=''), gr.update(value='')]
+        for component in components:
+            if component in vis_group:
+                output.append(gr.update(visible=True))
+            else:
+                output.append(gr.update(visible=False))
+        return output
+    def generate(sent1, ling):
+        input_ids = tokenizer.encode(sent1, return_tensors='pt').to(device)
+        ling1 = scaler.transform([ling['Source']])
+        ling2 = scaler.transform([ling['Target']])
+        inputs = {'sentence1_input_ids': input_ids,
+                'sentence1_ling': torch.tensor(ling1).float().to(device),
+                'sentence2_ling': torch.tensor(ling2).float().to(device),
+                'sentence1_attention_mask': torch.ones_like(input_ids)}
+        preds = []
+        with torch.no_grad():
+            pred = model.infer(inputs).cpu().numpy()
+        pred = tokenizer.batch_decode(pred,
+                skip_special_tokens=True)[0]
+        return pred
+    def generate_with_feedbacks(sent1, ling):
+        preds = []
+        eta = 0.1
+        input_ids = tokenizer.encode(sent1, return_tensors='pt').to(device)
+        ling1 = torch.tensor(scaler.transform([ling['Source']])).float().to(device)
+        ling2 = torch.tensor(scaler.transform([ling['Target']])).float().to(device)
+        ling1_embed = model.ling_embed(ling1)
+        ling2_embed = model.ling_embed(ling2)
+        cur_ling = ling1_embed + eta * (ling2_embed - ling1_embed)
+        inputs = {'sentence1_input_ids': input_ids,
+                'sent1_ling_embed': ling1_embed,
+                'sent2_ling_embed': ling2_embed,
+                'sentence1_attention_mask': torch.ones_like(input_ids)}
+        converged = False
+        c = 0
+        while not converged:
+            with torch.no_grad():
+                pred = model.infer(inputs)
+                inputs_pred = inputs.copy()
+                inputs_pred.update({'input_ids': pred,
+                    'attention_mask': torch.ones_like(pred)})
+                ling_pred = model.ling_disc(**inputs_pred)
+                ling_pred_embed = model.ling_embed(ling_pred)
+            if len(interpolations) == 0 or pred != interpolations[-1]:
+                interpolations.append(pred)
+            diff = torch.mean((ling2_embed - ling_pred_embed)**2)
+            scale = torch.norm(cur_ling)/torch.norm(ling2)
+            # print(f'Diff: {diff.item():.3f} / Scale: ({scale.item():.3f})>> {tokenizer.batch_decode(pred.cpu().numpy(), skip_special_tokens=True)[0]}')
+            if diff < 1e-5 or c >= 50:
+                converged = True
+            else:
+                # cur_ling = cur_ling + eta * (ling2_embed - ling_pred_embed)
+                inputs.update({
+                    'sentence1_input_ids': pred,
+                    # 'sent2_ling_embed': ling2_embed,
+                    'sentence1_attention_mask': torch.ones_like(pred)
+                    })
+                c += 1
+        pred = tokenizer.batch_decode(pred.cpu().numpy(),
+                skip_special_tokens=True)[0]
+        return pred
+    def generate_with_feedback(sent1, ling, approx):
+        if sent1 == '':
+            return ['Please input a source text.', '']
+        preds = []
+        interpolations = []
+        input_ids = tokenizer.encode(sent1, return_tensors='pt').to(device)
+        ling1 = torch.tensor(scaler.transform([ling['Source']])).float().to(device)
+        ling2 = torch.tensor(scaler.transform([ling['Target']])).float().to(device)
+        ling1_embed = model.ling_embed(ling1)
+        ling2_embed = model.ling_embed(ling2)
+        inputs = {'sentence1_input_ids': input_ids,
+                'sent1_ling_embed': ling1_embed,
+                'sent2_ling_embed': ling2_embed,
+                'sentence1_attention_mask': torch.ones_like(input_ids)}
+        converged = False
+        c = 0
+        eta = 0.3
+        while not converged:
+            with torch.no_grad():
+                pred = model.infer(inputs)
+                inputs_pred = inputs.copy()
+                inputs_pred.update({'input_ids': pred,
+                    'attention_mask': torch.ones_like(pred)})
+                pred_text = tokenizer.batch_decode(pred.cpu().numpy(),
+                        skip_special_tokens=True)[0]
+                if 'approximate' in approx:
+                    ling_pred = model.ling_disc(**inputs_pred)
+                elif 'exact' in approx:
+                    ling_pred = compute_lng(pred_text)
+                    ling_pred = scaler.transform([ling_pred])[0]
+                    ling_pred = torch.tensor(ling_pred).to(pred.device).float()
+                else:
+                    raise ValueError()
+                ling_pred_embed = model.ling_embed(ling_pred)
+            if len(interpolations) == 0 or pred_text != interpolations[-1]:
+                interpolations.append(pred_text)
+            diff = torch.mean((ling2_embed - ling_pred_embed)**2)
+            # print(f'Diff {diff.item():.3f}>> {tokenizer.batch_decode(pred.cpu().numpy(), skip_special_tokens=True)[0]}')
+            if diff < 10 or c >= 50:
+                converged = True
+            else:
+                ling2_embed = ling2_embed + eta * (ling_pred_embed - ling2_embed)
+                inputs.update({'sent2_ling_embed': ling2_embed})
+                c += 1
+        interpolation = '-- ' + '\n-- '.join(interpolations)
+        return [pred_text, interpolation]
+    def generate_random(sent1, ling, count, approx):
+        preds, interpolations = [], []
+        for c in range(count):
+            idx = np.random.randint(0, len(ling_collection))
+            ling_ex = ling_collection[idx]
+            ling['Target'] = ling_ex
+            pred, interpolation =  generate_with_feedback(sent1, ling, approx)
+            preds.append(pred)
+            interpolations.append(interpolation)
+        return '\n***\n'.join(preds), '\n***\n'.join(interpolations), ling
+    def estimate_gen(sent1, sent2, ling, approx):
+        if 'approximate' in approx:
+            input_ids = tokenizer.encode(sent2, return_tensors='pt').to(device)
+            with torch.no_grad():
+                ling_pred = model.ling_disc(input_ids=input_ids).cpu().numpy()
+            ling_pred = scaler.inverse_transform(ling_pred)[0]
+        elif 'exact' in approx:
+            ling_pred = compute_lng(sent2)
+        else:
+            raise ValueError()
+        ling['Target'] = ling_pred
+        gen = generate_with_feedback(sent1, ling, approx)
+        results = gen + [ling]
+        return results
+    def estimate_tgt(sent2, ling, approx):
+        if 'approximate' in approx:
+            input_ids = tokenizer.encode(sent2, return_tensors='pt').to(device)
+            with torch.no_grad():
+                ling_pred = model.ling_disc(input_ids=input_ids).cpu().numpy()
+            ling_pred = scaler.inverse_transform(ling_pred)[0]
+        elif 'exact' in approx:
+            ling_pred = compute_lng(sent2)
+        else:
+            raise ValueError()
+        ling['Target'] = ling_pred
+        return ling
+    def estimate_src(sent1, ling, approx):
+        if 'approximate' in approx:
+            input_ids = tokenizer.encode(sent1, return_tensors='pt').to(device)
+            with torch.no_grad():
+                ling_pred = model.ling_disc(input_ids=input_ids).cpu().numpy()
+            ling_pred = scaler.inverse_transform(ling_pred)[0]
+        elif 'exact' in approx:
+            ling_pred = compute_lng(sent1)
+        else:
+            raise ValueError()
+        ling['Source'] = ling_pred
+        return ling
+    def rand_target(ling):
+        ling['Target'] = scaler.inverse_transform([np.random.randn(*ling['Target'].shape)])[0]
+        return ling
+    def rand_ex_target(ling):
+        idx = np.random.randint(0, len(examples))
+        ling_ex = examples[idx][1]
+        ling['Target'] = ling_ex['Target']
+        return ling
+    def copy(ling):
+        ling['Target'] = ling['Source']
+        return ling
+    def add_noise(ling):
+        x = scaler.transform([ling['Target']])
+        x += np.random.randn(*ling['Target'].shape)
+        x = scaler.inverse_transform(x)[0]
+        ling['Target'] =  x
+        return ling
+    def add(ling):
+        x = scaler.transform([ling['Target']])
+        x += m
+        x = scaler.inverse_transform(x)[0]
+        ling['Target'] =  x
+        return ling
+    def sub(ling):
+        x = scaler.transform([ling['Target']])
+        x -= m
+        x = scaler.inverse_transform(x)[0]
+        ling['Target'] =  x
+        return ling
+    # title = ''
+    # for i, model in enumerate(models):
+    #     if i > 0:
+    #         title += '\n'
+    #     title += f"model ({i})\n\tUsing VAE = {model.args.ling_vae}\n\tUsing ICA = {model.args.use_ica}\n\tNumber of features = {model.args.lng_dim if not model.args.use_ica else model.args.n_ica}"
+    title = """
+    # LingConv: A System for Controlled Linguistic Conversion
+    ## Description
+    This system is an encoder-decoder model for complexity controlled text generation, guided by 241
+    linguistic complexity indices as key attributes. Given a sentence and a desired level of linguistic
+    complexity, the model can generate diverse paraphrases that maintain consistent meaning, adjusted for
+    different linguistic complexity levels. However, it's important to note that not all index combinations are
+    feasible (such as requesting a sentence of "length" 5 with 10 "unique words"). To ensure high quality
+    outputs, our approach interpolates the embedding of linguistic indices to locate the most closely matched,
+    achievable set of indices for the given target.
+    """
+    guide = """
+    You may use the system in on of the following ways:
+    **Randomized Paraphrase Generation**: Select this option to produce multiple paraphrases with a range
+    of linguistic complexity. You need to provide a source text, specify the number of paraphrases you want,
+    and click "Generate." The linguistic complexity of the paraphrases will be determined randomly.
+    **Complexity-Matched Paraphrasing**: Select this option to generate a paraphrase of the given source
+    sentence that closely mirrors the linguistic complexity of another given sentence. Input your source
+    sentence along with another sentence (which will serve only to extract linguistic indices for the
+    paraphrase generation). Then, click "Generate."
+    **Manual Linguistic Control**: Select this option to manually control the linguistic complexity of the
+    generated text. We provided a set of tools for manual adjustments of the desired linguistic complexity of
+    the target sentence. These tools enable the user to extract linguistic indices from a given sentence,
+    generate a random (yet coherent) set of linguistic indices, and add or remove noise from the indices.
+    These tools are designed for experimental use and require the user to possess linguistic expertise for
+    effective input of linguistic indices. To use these tools, select "Tools to assist in setting linguistic
+    indices." Once indices are entered, click "Generate."
+    Second, you may select to use exact or approximate computation of linguistic indices (used in mode (2) and
+    in quality control of the genration). Approximate computation is significantly faster.
+    Third, you may view the intermediate sentences of the quality control process by selecting the checkbox.
+    Fourth, you may try out some examples by clicking on "Examples...". Examples consist of a source sentences,
+    the indices of the source sentences, and a sample set of target linguistic indices.
+    Please make your choice below.
+    """
+    sent1 = gr.Textbox(label='Source text')
+    ling = gr.Dataframe(value = [[x, 0, 0] for x in lng_names],
+            headers=['Index', 'Source', 'Target'],
+            datatype=['str', 'number', 'number'], visible=False)
+    css = """
+    #guide span.svelte-s1r2yt {font-size: 22px !important;
+                    font-weight: 600 !important}
+    """
+    with gr.Blocks(css=css) as demo:
+        gr.Markdown(title)
+        with gr.Accordion("Quick Start Guide", open=False, elem_id='guide'):
+            gr.Markdown(guide)
+        mode = gr.Radio(value='Randomized Paraphrase Generation',
+                label='How would you like to use this system?',
+                type="index",
+                choices=['Randomized Paraphrase Generation',
+                    'Complexity-Matched Paraphrasing', 'Manual Linguistic Control'])
+        approx = gr.Radio(value='Use approximate computation of linguistic indices (faster)',
+                choices=['Use approximate computation of linguistic indices (faster)',
+                    'Use exact computation of linguistic indices'], container=False, show_label=False)
+        control_interpolation = gr.Checkbox(label='View the intermediate sentences in the interpolation of linguistic indices')
+        with gr.Accordion("Examples...", open=False):
+            gr.Examples(examples, [sent1, ling], examples_per_page=4, label=None)
+        with gr.Row():
+            sent1.render()
+            with gr.Column():
+                sent2 = gr.Textbox(label='Generated text')
+        interpolation = gr.Textbox(label='Quality control interpolation', visible=False, lines=5)
+        #####################
+        with gr.Row():
+            generate_random_btn = gr.Button("Generate",
+                    variant='primary', scale=1, visible=True)
+            count = gr.Number(label='Number of generated sentences', value=3, precision=0, scale=1, visible=True)
+        # generate_fb_btn = gr.Button("Generate with auto-adjust (towards pred)")
+        # generate_fb_s_btn = gr.Button("Generate with auto-adjust (moving s)")
+        # add_noise_btn = gr.Button('Add noise to target linguistic indices')
+        #####################
+        with gr.Row():
+            estimate_gen_btn = gr.Button("Generate",
+                    variant='primary',
+                    scale=1, visible=False)
+            sent_ling_gen = gr.Textbox(label='Text to estimate linguistic indices', scale=1, visible=False)
+        #####################
+        generate_btn = gr.Button("Generate", variant='primary', visible=False)
+        with gr.Accordion("Tools to assist in the setting of linguistic indices...", open=False, visible=False) as ling_tools:
+            with gr.Row():
+                estimate_tgt_btn = gr.Button("Estimate linguistic indices of this sentence", visible=False)
+                sent_ling_est = gr.Textbox(label='Text to estimate linguistic indices', scale=2, visible=False)
+            estimate_src_btn = gr.Button("Estimate linguistic indices of source sentence", visible=False)
+            # rand_btn = gr.Button("Random target")
+            rand_ex_btn = gr.Button("Random target", size='lg', visible=False)
+            copy_btn = gr.Button("Copy linguistic indices of source to target", size='sm', visible=False)
+            with gr.Row():
+                add_btn = gr.Button('Add \u03B5 to target linguistic indices', visible=False)
+                sub_btn = gr.Button('Subtract \u03B5 from target linguistic indices', visible=False)
+        ling.render()
+        #####################
+        estimate_src_btn.click(estimate_src, inputs=[sent1, ling, approx], outputs=[ling])
+        estimate_tgt_btn.click(estimate_tgt, inputs=[sent_ling_est, ling, approx], outputs=[ling])
+        # estimate_tgt_btn.click(estimate_tgt, inputs=[sent_ling, ling], outputs=[ling])
+        estimate_gen_btn.click(estimate_gen, inputs=[sent1, sent_ling_gen, ling, approx], outputs=[sent2, interpolation, ling])
+        # rand_btn.click(rand_target, inputs=[ling], outputs=[ling])
+        rand_ex_btn.click(rand_ex_target, inputs=[ling], outputs=[ling])
+        copy_btn.click(copy, inputs=[ling], outputs=[ling])
+        generate_btn.click(generate_with_feedback, inputs=[sent1, ling, approx], outputs=[sent2, interpolation])
+        generate_random_btn.click(generate_random, inputs=[sent1, ling, count, approx],
+                outputs=[sent2, interpolation, ling])
+        # generate_fb_btn.click(generate_with_feedback, inputs=[sent1, ling], outputs=sent2s)
+        # generate_fb_s_btn.click(generate_with_feedbacks, inputs=[sent1, ling], outputs=sent2s)
+        add_btn.click(add, inputs=[ling], outputs=[ling])
+        sub_btn.click(sub, inputs=[ling], outputs=[ling])
+        # add_noise_btn.click(add_noise, inputs=[ling], outputs=[ling])
+        group1 = [generate_random_btn, count]
+        group2 = [estimate_gen_btn, sent_ling_gen]
+        group3 = [generate_btn, estimate_src_btn, estimate_tgt_btn, sent_ling_est, rand_ex_btn, copy_btn, add_btn, sub_btn, ling, ling_tools]
+        components = group1 + group2 + group3
+        mode.change(visibility, inputs=[mode], outputs=[sent2, interpolation] + components)
+        control_interpolation.change(lambda v: gr.update(visible=v), inputs=[control_interpolation],
+                outputs=[interpolation])
+    demo.launch(share=True)

model.py ADDED Viewed

	@@ -0,0 +1,696 @@

+import types
+import torch
+import torch.nn.functional as F
+import numpy as np
+from torch import nn
+from transformers import T5ForConditionalGeneration, T5EncoderModel, AutoModel, LogitsProcessor, LogitsProcessorList
+from functools import partial
+from compute_lng import compute_lng
+from undecorate import unwrap
+from types import MethodType
+from utils import *
+from ling_disc import DebertaReplacedTokenizer
+from const import *
+def vae_sample(mu, logvar):
+    std = torch.exp(0.5 * logvar)
+    eps = torch.randn_like(std)
+    return eps * std + mu
+class VAE(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        self.encoder = nn.Sequential(
+                nn.Linear(args.input_dim, args.hidden_dim),
+                nn.ReLU(),
+                nn.Linear(args.hidden_dim, args.hidden_dim),
+                nn.ReLU(),
+                )
+        self.decoder = nn.Sequential(
+                nn.Linear(args.latent_dim, args.hidden_dim),
+                nn.ReLU(),
+                nn.Linear(args.hidden_dim, args.hidden_dim),
+                nn.ReLU(),
+                nn.Linear(args.hidden_dim, args.input_dim),
+                )
+        self.fc_mu = nn.Linear(args.hidden_dim, args.latent_dim)
+        self.fc_var = nn.Linear(args.hidden_dim, args.latent_dim)
+    def forward(self, x):
+        h = self.encoder(x)
+        mu = self.fc_mu(h)
+        logvar = self.fc_var(h)
+        x = vae_sample(mu, logvar)
+        o = self.decoder(x)
+        return o, (mu, logvar)
+class LingGenerator(nn.Module):
+    def __init__(self, args, hidden_dim=1000):
+        super().__init__()
+        self.gen = T5EncoderModel.from_pretrained('google/flan-t5-small')
+        self.hidden_size = self.gen.config.d_model
+        self.ling_embed = nn.Linear(args.lng_dim, self.hidden_size)
+        # self.gen = nn.Sequential(
+        #         nn.Linear(args.lng_dim, 2*hidden_dim),
+        #         nn.ReLU(),
+        #         nn.BatchNorm1d(2*hidden_dim),
+        #         nn.Linear(2*hidden_dim, 2*hidden_dim),
+        #         nn.ReLU(),
+        #         nn.BatchNorm1d(2*hidden_dim),
+        #         nn.Linear(2*hidden_dim, hidden_dim),
+        #         nn.ReLU(),
+        #         )
+        self.gen_type = args.linggen_type
+        self.gen_input = args.linggen_input
+        if self.gen_type == 'vae':
+            self.gen_mu = nn.Linear(hidden_dim, args.lng_dim)
+            self.gen_logvar = nn.Linear(hidden_dim, args.lng_dim)
+        elif self.gen_type == 'det':
+            self.projection = nn.Linear(self.hidden_size, args.lng_dim)
+    def forward(self, batch):
+        inputs_embeds = self.gen.shared(batch['sentence1_input_ids'])
+        inputs_att_mask = batch['sentence1_attention_mask']
+        bs = inputs_embeds.shape[0]
+        if self.gen_input == 's+l':
+            sent1_ling = self.ling_embed(batch['sentence1_ling'])
+            sent1_ling = sent1_ling.view(bs, 1, -1)
+            inputs_embeds = inputs_embeds + sent1_ling
+        gen = self.gen(inputs_embeds=inputs_embeds,
+                attention_mask=inputs_att_mask).last_hidden_state.mean(1)
+        # gen = self.gen(batch['sentence1_ling'])
+        cache = {}
+        if self.gen_type == 'vae':
+            mu = self.gen_mu(gen)
+            logvar = self.gen_logvar(gen)
+            output = vae_sample(mu, logvar)
+            cache['linggen_mu'] = mu
+            cache['linggen_logvar'] = logvar
+        elif self.gen_type == 'det':
+            output = self.projection(gen)
+        return output, cache
+class LingDisc(nn.Module):
+    def __init__(self,
+                 model_name,
+                 disc_type,
+                 disc_ckpt,
+                 lng_dim=40,
+                 quant_nbins=1,
+                 disc_lng_dim=None,
+                 lng_ids=None,
+                 **kwargs):
+        super().__init__()
+        if disc_type == 't5':
+            self.encoder = T5EncoderModel.from_pretrained(model_name)
+            hidden_dim = self.encoder.config.d_model
+            self.dropout = nn.Dropout(0.2)
+            self.lng_dim = disc_lng_dim if disc_lng_dim else lng_dim
+            self.quant = quant_nbins > 1
+            self.quant = False
+            if self.quant:
+                self.ling_classifier = nn.Linear(hidden_dim, self.lng_dim * quant_nbins)
+            else:
+                self.ling_classifier = nn.Linear(hidden_dim, self.lng_dim)
+            lng_ids = torch.tensor(lng_ids) if lng_ids is not None else None
+            # from const import used_indices
+            # lng_ids = torch.tensor(used_indices)
+            self.register_buffer('lng_ids', lng_ids)
+        elif disc_type == 'deberta':
+            self.encoder= DebertaReplacedTokenizer.from_pretrained(
+                    pretrained_model_name_or_path=disc_ckpt,
+                    tok_model_name = model_name,
+                    problem_type='regression', num_labels=40)
+            self.quant = False
+        self.disc_type = disc_type
+    def forward(self, **batch):
+        if not 'attention_mask' in batch:
+            if 'input_ids' in batch:
+                att_mask = torch.ones_like(batch['input_ids'])
+            else:
+                att_mask = torch.ones_like(batch['logits'])[:,:,0]
+        else:
+            att_mask = batch['attention_mask']
+        if 'input_ids' in batch:
+            enc_output = self.encoder(input_ids=batch['input_ids'],
+                    attention_mask=att_mask)
+        elif 'logits' in batch:
+            logits = batch['logits']
+            scores = F.softmax(logits, dim = -1)
+            onehot = F.one_hot(logits.argmax(-1), num_classes=logits.shape[2]).float().to(logits.device)
+            onehot_ = scores - scores.detach() + onehot
+            embed_layer = self.encoder.get_input_embeddings()
+            if isinstance(embed_layer, nn.Sequential):
+                for i, module in enumerate(embed_layer):
+                    if i == 0:
+                        embeds = torch.matmul(onehot_, module.weight)
+                    else:
+                        embeds = module(embeds)
+            else:
+                embeds =  onehot_ @ embed_layer.weight
+                embeds = torch.matmul(onehot_, embed_layer.weight)
+            enc_output = self.encoder(inputs_embeds=embeds,
+                    attention_mask=att_mask)
+        if self.disc_type == 't5':
+            sent_emb = self.dropout(enc_output.last_hidden_state.mean(1))
+            bs = sent_emb.shape[0]
+            output = self.ling_classifier(sent_emb)
+            if self.quant:
+                output = output.reshape(bs, -1, self.lng_dim)
+            if self.lng_ids is not None:
+                output = torch.index_select(output, 1, self.lng_ids)
+        elif self.disc_type == 'deberta':
+            output = enc_output.logits
+        return output
+class SemEmb(nn.Module):
+    def __init__(self, backbone, sep_token_id):
+        super().__init__()
+        self.backbone = backbone
+        self.sep_token_id = sep_token_id
+        hidden_dim = self.backbone.config.d_model
+        self.projection = nn.Sequential(nn.ReLU(),
+                nn.Dropout(0.2),
+                nn.Linear(hidden_dim, 1))
+    def forward(self, **batch):
+        bs = batch['sentence1_attention_mask'].shape[0]
+        ones = torch.ones((bs, 1), device=batch['sentence1_attention_mask'].device)
+        sep = torch.ones((bs, 1), dtype=torch.long,
+                device=batch['sentence1_attention_mask'].device) * self.sep_token_id
+        att_mask = torch.cat([batch['sentence1_attention_mask'], ones, batch['sentence2_attention_mask']], dim=1)
+        if 'logits' in batch:
+            input_ids = torch.cat([batch['sentence1_input_ids'], sep], dim=1)
+            embeds1 = self.backbone.shared(input_ids)
+            logits = batch['logits']
+            scores = F.softmax(logits, dim = -1)
+            onehot = F.one_hot(logits.argmax(-1), num_classes=logits.shape[2]).float().to(logits.device)
+            onehot_ = scores - scores.detach() + onehot
+            embeds2 =  onehot_ @ self.backbone.shared.weight
+            embeds1_2 = torch.cat([embeds1, embeds2], dim=1)
+            hidden_units = self.backbone(inputs_embeds=embeds1_2,
+                    attention_mask=att_mask).last_hidden_state.mean(1)
+        elif 'sentence2_input_ids' in batch:
+            input_ids = torch.cat([batch['sentence1_input_ids'], sep, batch['sentence2_input_ids']], dim=1)
+            hidden_units = self.backbone(input_ids=input_ids,
+                    attention_mask=att_mask).last_hidden_state.mean(1)
+        probs = self.projection(hidden_units)
+        return probs
+def prepare_inputs_for_generation(
+        combine_method,
+        ling2_only,
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        cross_attn_head_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        sent1_ling=None,
+        sent2_ling=None,
+        **kwargs
+    ):
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+        input_ids = input_ids.clone()
+        decoder_inputs_embeds = self.shared(input_ids)
+        if combine_method == 'decoder_add_first':
+            sent2_ling = torch.cat([sent2_ling,
+                torch.repeat_interleave(torch.zeros_like(sent2_ling), input_ids.shape[1] - 1, dim=1)], dim = 1)
+        if combine_method == 'decoder_concat':
+            if ling2_only:
+                decoder_inputs_embeds = torch.cat([sent2_ling, decoder_inputs_embeds], dim=1)
+            else:
+                decoder_inputs_embeds = torch.cat([sent1_ling, sent2_ling, decoder_inputs_embeds], dim=1)
+        elif combine_method == 'decoder_add'or (past_key_values is None and combine_method == 'decoder_add_first'):
+            if ling2_only:
+                decoder_inputs_embeds = decoder_inputs_embeds + sent2_ling
+            else:
+                decoder_inputs_embeds = decoder_inputs_embeds + sent1_ling + sent2_ling
+        return {
+            "decoder_inputs_embeds": decoder_inputs_embeds,
+            "past_key_values": past_key_values,
+            "encoder_outputs": encoder_outputs,
+            "attention_mask": attention_mask,
+            "head_mask": head_mask,
+            "decoder_head_mask": decoder_head_mask,
+            "cross_attn_head_mask": cross_attn_head_mask,
+            "use_cache": use_cache,
+        }
+class LogitsAdd(LogitsProcessor):
+    def __init__(self, sent2_ling):
+        super().__init__()
+        self.sent2_ling = sent2_ling
+    def __call__(self, input_ids, scores):
+        return scores + self.sent2_ling
+class EncoderDecoderVAE(nn.Module):
+    def __init__(self, args, pad_token_id, sepeos_token_id, vocab_size = 32128):
+        super().__init__()
+        self.backbone = T5ForConditionalGeneration.from_pretrained(args.model_name)
+        self.backbone.prepare_inputs_for_generation = types.MethodType(
+                partial(prepare_inputs_for_generation, args.combine_method, args.ling2_only),
+                self.backbone)
+        self.args = args
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = sepeos_token_id
+        hidden_dim = self.backbone.config.d_model if not 'logits' in args.combine_method else vocab_size
+        if args.combine_method == 'fusion1':
+            self.fusion = nn.Sequential(
+                    nn.Linear(hidden_dim + 2 * args.lng_dim, hidden_dim),
+                    )
+        elif args.combine_method == 'fusion2':
+            self.fusion = nn.Sequential(
+                    nn.Linear(hidden_dim + 2 * args.lng_dim, hidden_dim),
+                    nn.ReLU(),
+                    nn.Linear(hidden_dim, hidden_dim),
+                    )
+        elif 'concat' in args.combine_method or 'add' in args.combine_method:
+            if args.ling_embed_type == 'two-layer':
+                self.ling_embed = nn.Sequential(
+                        nn.Linear(args.lng_dim, args.lng_dim),
+                        nn.ReLU(),
+                        nn.Linear(args.lng_dim, hidden_dim),
+                        )
+            else:
+                self.ling_embed = nn.Linear(args.lng_dim, hidden_dim)
+            self.ling_dropout = nn.Dropout(args.ling_dropout)
+        if args.ling_vae:
+            self.ling_mu = nn.Linear(hidden_dim, hidden_dim)
+            self.ling_logvar = nn.Linear(hidden_dim, hidden_dim)
+            nn.init.xavier_uniform_(self.ling_embed.weight)
+            nn.init.xavier_uniform_(self.ling_mu.weight)
+            nn.init.xavier_uniform_(self.ling_logvar.weight)
+        generate_with_grad = unwrap(self.backbone.generate)
+        self.backbone.generate_with_grad = MethodType(generate_with_grad, self.backbone)
+    def get_fusion_layer(self):
+        if 'fusion' in self.args.combine_method:
+            return self.fusion
+        elif 'concat' in self.args.combine_method or 'add' in self.args.combine_method:
+            return self.ling_embed
+        else:
+            return None
+    def sample(self, mu, logvar):
+        std = torch.exp(0.5 * logvar)
+        return mu + std * torch.randn_like(std)
+    def encode(self, batch):
+        # Run the backbone encoder over sentence 1, optionally injecting the
+        # linguistic-feature embeddings into the encoder input.
+        #
+        # Reads from `batch`: 'sentence1_attention_mask' always; either
+        # 'inputs_embeds' or 'sentence1_input_ids'; and, for the
+        # 'input_concat' / 'input_add' combine methods, either precomputed
+        # 'sent1_ling_embed' / 'sent2_ling_embed' or the raw
+        # 'sentence1_ling' / 'sentence2_ling' vectors.
+        #
+        # Returns (encoder_output, attention_mask, cache). `cache` stays
+        # empty unless one of the two input_* combine methods is active.
+        if 'inputs_embeds' in batch:
+            inputs_embeds = batch['inputs_embeds']
+        else:
+            inputs_embeds = self.backbone.shared(batch['sentence1_input_ids'])
+        inputs_att_mask = batch['sentence1_attention_mask']
+        bs = inputs_embeds.shape[0]
+        cache = {}
+        if self.args.combine_method in ('input_concat', 'input_add'):
+            # Precomputed embeddings in the batch take precedence over
+            # embedding the raw feature vectors here.
+            if 'sent1_ling_embed' in batch:
+                sent1_ling = batch['sent1_ling_embed']
+            else:
+                sent1_ling = self.ling_embed(self.ling_dropout(batch['sentence1_ling']))
+            if 'sent2_ling_embed' in batch:
+                sent2_ling = batch['sent2_ling_embed']
+            else:
+                sent2_ling = self.ling_embed(self.ling_dropout(batch['sentence2_ling']))
+            if self.args.ling_vae:
+                # Replace each embedding by a sample from the Gaussian
+                # predicted from it; mu/logvar are cached for the caller.
+                sent1_ling = F.leaky_relu(sent1_ling)
+                sent1_mu, sent1_logvar = self.ling_mu(sent1_ling), self.ling_logvar(sent1_ling)
+                sent1_ling = self.sample(sent1_mu, sent1_logvar)
+                sent2_ling = F.leaky_relu(sent2_ling)
+                sent2_mu, sent2_logvar = self.ling_mu(sent2_ling), self.ling_logvar(sent2_ling)
+                sent2_ling = self.sample(sent2_mu, sent2_logvar)
+                cache.update({'sent1_mu': sent1_mu, 'sent1_logvar': sent1_logvar,
+                    'sent2_mu': sent2_mu, 'sent2_logvar': sent2_logvar,
+                    'sent1_ling': sent1_ling, 'sent2_ling': sent2_ling})
+            else:
+                cache.update({'sent1_ling': sent1_ling, 'sent2_ling': sent2_ling})
+            # One extra "token" per sentence-level embedding.
+            sent1_ling = sent1_ling.view(bs, 1, -1)
+            sent2_ling = sent2_ling.view(bs, 1, -1)
+            if self.args.combine_method == 'input_concat':
+                # Append the embedding(s) after the tokens and extend the
+                # attention mask by the same number of positions.
+                if self.args.ling2_only:
+                    inputs_embeds = torch.cat([inputs_embeds, sent2_ling], dim=1)
+                    inputs_att_mask = torch.cat([inputs_att_mask,
+                        torch.ones((bs, 1)).to(inputs_embeds.device)], dim=1)
+                else:
+                    inputs_embeds = torch.cat([inputs_embeds, sent1_ling, sent2_ling], dim=1)
+                    inputs_att_mask = torch.cat([inputs_att_mask,
+                        torch.ones((bs, 2)).to(inputs_embeds.device)], dim=1)
+            elif self.args.combine_method == 'input_add':
+                # Broadcast-add the embedding(s) onto every token embedding.
+                if self.args.ling2_only:
+                    inputs_embeds = inputs_embeds + sent2_ling
+                else:
+                    inputs_embeds = inputs_embeds + sent1_ling + sent2_ling
+        return self.backbone.encoder(inputs_embeds=inputs_embeds,
+                attention_mask=inputs_att_mask), inputs_att_mask, cache
+    def decode(self, batch, enc_output, inputs_att_mask, generate):
+        # Decode sentence 2 from the encoder output, either by generation
+        # (generate=True) or by teacher forcing on 'sentence2_input_ids'.
+        #
+        # Returns (sequences, cache) when generating, with the stacked
+        # per-step scores under cache['scores']; otherwise returns
+        # (backbone_output, cache).
+        bs = inputs_att_mask.shape[0]
+        cache = {}
+        if self.args.combine_method in ('embed_concat', 'decoder_concat', 'decoder_add', 'logits_add', 'decoder_add_first'):
+            # Precomputed embeddings in the batch take precedence; sentence 1
+            # features are optional here, sentence 2 features are required.
+            if 'sent1_ling_embed' in batch:
+                sent1_ling = batch['sent1_ling_embed']
+            elif 'sentence1_ling' in batch:
+                sent1_ling = self.ling_embed(self.ling_dropout(batch['sentence1_ling']))
+            else:
+                sent1_ling = None
+            if 'sent2_ling_embed' in batch:
+                sent2_ling = batch['sent2_ling_embed']
+            else:
+                sent2_ling = self.ling_embed(self.ling_dropout(batch['sentence2_ling']))
+            if self.args.ling_vae:
+                # NOTE(review): sent1_ling may be None at this point; this
+                # branch does not guard against that — confirm callers always
+                # provide sentence 1 features when ling_vae is enabled.
+                sent1_ling = F.leaky_relu(sent1_ling)
+                sent1_mu, sent1_logvar = self.ling_mu(sent1_ling), self.ling_logvar(sent1_ling)
+                sent1_ling = self.sample(sent1_mu, sent1_logvar)
+                sent2_ling = F.leaky_relu(sent2_ling)
+                sent2_mu, sent2_logvar = self.ling_mu(sent2_ling), self.ling_logvar(sent2_ling)
+                sent2_ling = self.sample(sent2_mu, sent2_logvar)
+                cache.update({'sent1_mu': sent1_mu, 'sent1_logvar': sent1_logvar,
+                    'sent2_mu': sent2_mu, 'sent2_logvar': sent2_logvar,
+                    'sent1_ling': sent1_ling, 'sent2_ling': sent2_ling})
+            else:
+                cache.update({'sent2_ling': sent2_ling})
+                if sent1_ling is not None:
+                    cache.update({'sent1_ling': sent1_ling})
+            if sent1_ling is not None:
+                sent1_ling = sent1_ling.view(bs, 1, -1)
+            sent2_ling = sent2_ling.view(bs, 1, -1)
+            if self.args.combine_method == 'decoder_add_first' and not generate:
+                # Zero-pad along the sequence axis to the target length so
+                # the later addition only changes the first decoder position.
+                sent2_ling = torch.cat([sent2_ling,
+                    torch.repeat_interleave(torch.zeros_like(sent2_ling), batch['sentence2_input_ids'].shape[1] - 1, dim=1)], dim = 1)
+        else:
+            sent1_ling, sent2_ling = None, None
+        if self.args.combine_method == 'embed_concat':
+            # Append both embeddings to the encoder states and extend the
+            # cross-attention mask by two positions.
+            enc_output.last_hidden_state = torch.cat([enc_output.last_hidden_state,
+                sent1_ling, sent2_ling], dim=1)
+            inputs_att_mask = torch.cat([inputs_att_mask,
+                torch.ones((bs, 2)).to(inputs_att_mask.device)], dim=1)
+        elif 'fusion' in self.args.combine_method:
+            # Repeat the raw feature vectors at every encoder position and
+            # fuse them with the hidden states along the feature axis.
+            sent1_ling = batch['sentence1_ling'].unsqueeze(1)\
+                    .expand(-1, enc_output.last_hidden_state.shape[1], -1)
+            sent2_ling = batch['sentence2_ling'].unsqueeze(1)\
+                    .expand(-1, enc_output.last_hidden_state.shape[1], -1)
+            if self.args.ling2_only:
+                # NOTE(review): self.fusion is built for an input width of
+                # hidden_dim + 2 * lng_dim, while only one feature vector is
+                # concatenated here — confirm this combination is supported.
+                combined_embedding = torch.cat([enc_output.last_hidden_state, sent2_ling], dim=2)
+            else:
+                combined_embedding = torch.cat([enc_output.last_hidden_state, sent1_ling, sent2_ling], dim=2)
+            enc_output.last_hidden_state = self.fusion(combined_embedding)
+        if generate:
+            if self.args.combine_method == 'logits_add':
+                # Add the sentence 2 embedding to the scores at every step.
+                logits_processor = LogitsProcessorList([LogitsAdd(sent2_ling.view(bs, -1))])
+            else:
+                logits_processor = LogitsProcessorList()
+            # sent1_ling / sent2_ling are forwarded as extra keyword
+            # arguments; prepare_inputs_for_generation accepts parameters
+            # with these names.
+            dec_output = self.backbone.generate_with_grad(
+                    attention_mask=inputs_att_mask,
+                    encoder_outputs=enc_output,
+                    sent1_ling=sent1_ling,
+                    sent2_ling=sent2_ling,
+                    return_dict_in_generate=True,
+                    output_scores=True,
+                    logits_processor = logits_processor,
+                    # renormalize_logits=True,
+                    # do_sample=True,
+                    # top_p=0.8,
+                    eos_token_id=self.eos_token_id,
+                    # min_new_tokens=3,
+                    # repetition_penalty=1.2,
+                    max_length=self.args.max_length,
+                    )
+            # Stack the per-step score tensors along a new dim 1.
+            scores = torch.stack(dec_output.scores, 1)
+            cache.update({'scores': scores})
+            return dec_output.sequences, cache
+        # Teacher forcing: decoder inputs are the right-shifted targets and
+        # positions equal to the pad id are set to -100 in the labels.
+        decoder_input_ids = self.backbone._shift_right(batch['sentence2_input_ids'])
+        decoder_inputs_embeds = self.backbone.shared(decoder_input_ids)
+        decoder_att_mask = batch['sentence2_attention_mask']
+        labels = batch['sentence2_input_ids'].clone()
+        labels[labels == self.pad_token_id] = -100
+        if self.args.combine_method == 'decoder_concat':
+            # Prepend the embedding(s) to the decoder inputs and grow the
+            # mask and labels by the same number of positions.
+            # NOTE(review): the label prefix is filled with pad_token_id, not
+            # -100, so these positions presumably still enter the loss —
+            # confirm this is intended.
+            if self.args.ling2_only:
+                decoder_inputs_embeds = torch.cat([sent2_ling, decoder_inputs_embeds], dim=1)
+                decoder_att_mask = torch.cat([torch.ones((bs, 1)).to(decoder_inputs_embeds.device), decoder_att_mask], dim=1)
+                labels = torch.cat([torch.ones((bs, 1), dtype=torch.int64).to(decoder_inputs_embeds.device) * self.pad_token_id,
+                    labels], dim=1)
+            else:
+                decoder_inputs_embeds = torch.cat([sent1_ling, sent2_ling, decoder_inputs_embeds], dim=1)
+                decoder_att_mask = torch.cat([torch.ones((bs, 2)).to(decoder_inputs_embeds.device), decoder_att_mask], dim=1)
+                labels = torch.cat([torch.ones((bs, 2), dtype=torch.int64).to(decoder_inputs_embeds.device) * self.pad_token_id,
+                    labels], dim=1)
+        elif self.args.combine_method == 'decoder_add' or self.args.combine_method == 'decoder_add_first' :
+            # NOTE(review): combine_weight scales the embedding only in the
+            # ling2_only branch, and prepare_inputs_for_generation applies no
+            # weight at all — confirm training and generation are meant to
+            # differ here.
+            if self.args.ling2_only:
+                decoder_inputs_embeds = decoder_inputs_embeds + self.args.combine_weight * sent2_ling
+            else:
+                decoder_inputs_embeds = decoder_inputs_embeds + sent1_ling + sent2_ling
+        dec_output = self.backbone(
+                decoder_inputs_embeds=decoder_inputs_embeds,
+                decoder_attention_mask=decoder_att_mask,
+                encoder_outputs=enc_output,
+                attention_mask=inputs_att_mask,
+                labels=labels,
+                )
+        if self.args.combine_method == 'logits_add':
+            # Shift the logits by the weighted embedding and recompute the
+            # loss on the shifted logits, replacing the backbone's loss.
+            dec_output.logits = dec_output.logits + self.args.combine_weight * sent2_ling
+            vocab_size = dec_output.logits.size(-1)
+            dec_output.loss = F.cross_entropy(dec_output.logits.view(-1, vocab_size), labels.view(-1))
+        return dec_output, cache
+    def forward(self, batch, generate=False):
+        enc_output, enc_att_mask, cache = self.encode(batch)
+        dec_output, cache2 = self.decode(batch, enc_output, enc_att_mask, generate)
+        cache.update(cache2)
+        return dec_output, enc_output, cache
+    def infer_with_cache(self, batch):
+        dec_output, _, cache = self(batch, generate = True)
+        return dec_output, cache
+    def infer(self, batch):
+        dec_output, _ = self.infer_with_cache(batch)
+        return dec_output
+    def infer_with_feedback_BP(self, ling_disc, sem_emb, batch, tokenizer, scaler):
+        from torch.autograd import grad
+        interpolations = []
+        def line_search():
+            best_val = None
+            best_loss = None
+            eta = 1e3
+            sem_prob = 1
+            patience = 4
+            while patience > 0:
+                param_ = param - eta * grads
+                with torch.no_grad():
+                    new_loss, pred = get_loss(param_)
+                max_len = pred.shape[1]
+                lens = torch.where(pred == self.eos_token_id, 1, 0).argmax(-1) + 1
+                # if lens.item() == 1:
+                #     patience -= 1
+                batch.update({
+                    'sentence2_input_ids': pred,
+                    'sentence2_attention_mask': sequence_mask(lens, max_len = max_len)
+                    })
+                sem_prob = torch.sigmoid(sem_emb(**batch)).item()
+                # if sem_prob <= 0.1:
+                #     patience -= 1
+                # f.write(f'[{eta}], [{new_loss.item():.2f}], [{sem_prob:.2f}], {tokenizer.decode(pred[0])}\n')
+                # print(f'[{eta}], [{new_loss.item():.2f}], [{sem_prob:.2f}], {tokenizer.decode(pred[0])}\n')
+                if new_loss < loss and sem_prob >= 0.90 and lens.item() > 1:
+                    return param_
+                eta *= 2.25
+                patience -= 1
+            return False
+        def get_loss(param):
+            if self.args.feedback_param == 'l':
+                batch.update({'sent2_ling_embed': param})
+            elif self.args.feedback_param == 's':
+                batch.update({'inputs_embeds': param})
+            if self.args.feedback_param == 'logits':
+                logits = param
+                pred = param.argmax(-1)
+            else:
+                pred, cache = self.infer_with_cache(batch)
+                logits = cache['scores']
+            out = ling_disc(logits = logits)
+            probs = F.softmax(out, 1)
+            if ling_disc.quant:
+                loss = F.cross_entropy(out, batch['sentence2_discr'])
+            else:
+                loss = F.mse_loss(out, batch['sentence2_ling'])
+            return loss, pred
+        if self.args.feedback_param == 'l':
+            ling2_embed = self.ling_embed(batch['sentence2_ling'])
+            param = torch.nn.Parameter(ling2_embed, requires_grad = True)
+        elif self.args.feedback_param == 's':
+            inputs_embeds = self.backbone.shared(batch['sentence1_input_ids'])
+            param = torch.nn.Parameter(inputs_embeds, requires_grad = True)
+        elif self.args.feedback_param == 'logits':
+            logits = self.infer_with_cache(batch)[1]['scores']
+            param = torch.nn.Parameter(logits, requires_grad = True)
+        f = open(self.args.fb_log, 'a') if self.args.fb_log else None
+        target_np = batch['sentence2_ling'][0].cpu().numpy()
+        while True:
+            loss, pred = get_loss(param)
+            pred_text = tokenizer.batch_decode(pred.cpu().numpy(),
+                    skip_special_tokens=True)[0]
+            if f:
+                # from compute_lng import compute_lng
+                # lng_pred = scaler.transform(np.array([compute_lng(pred_text)])[:,used_indices])[0]
+                # real_loss = np.mean((lng_pred - target_np)**2)
+                # f.write(f'Loss: {loss.item():.2f}\tReal loss:{real_loss:.2f}\t{pred_text}\n')
+                f.write(f'*** [{loss.item():.2f}], {pred_text}\n')
+            interpolations.append(pred_text)
+            if loss < 1:
+                break
+            self.zero_grad()
+            grads = grad(loss, param)[0]
+            param = line_search()
+            if param is False:
+                break
+        if f:
+            f.write(f'[return] {pred_text}\n\n')
+            f.close()
+        return pred, [pred_text, interpolations]
+    def infer_with_feedback(self, ling_disc, batch, tokenizer, scaler, approx=False):
+        # Iteratively regenerate sentence 2, measuring the linguistic
+        # features of each prediction and adjusting the conditioning
+        # embedding between rounds.
+        #
+        # With approx=True the features of the prediction come from
+        # `ling_disc`; otherwise they are computed with compute_lng on the
+        # decoded text and passed through `scaler.transform`.
+        #
+        # Returns [pred_text, interpolations]: the last decoded text and the
+        # list of distinct consecutive texts produced along the way.
+        # `batch` is updated in place.
+        interpolations = []
+        converged = False
+        c = 0
+        eta = 0.3
+        # Hard-coded switch: the embedding-space update below is always
+        # used; the raw-feature branch is kept but unreachable.
+        use_embed = True
+        if use_embed:
+            ling1_embed = self.ling_embed(batch['sentence1_ling'])
+            ling2_embed = self.ling_embed(batch['sentence2_ling'])
+            batch.update({
+                    'sent1_ling_embed': ling1_embed,
+                    'sent2_ling_embed': ling2_embed,
+                    })
+        else:
+            ling2 = batch['sentence2_ling']
+        # Keep the requested target features for the convergence check.
+        ling2_orig = batch['sentence2_ling'].clone()
+        while not converged:
+            with torch.no_grad():
+                pred = self.infer(batch)
+                inputs_pred = batch.copy()
+                inputs_pred.update({'input_ids': pred,
+                    'attention_mask': torch.ones_like(pred)})
+                # Only the first sequence of the batch is decoded.
+                pred_text = tokenizer.batch_decode(pred.cpu().numpy(),
+                        skip_special_tokens=True)[0]
+                if approx:
+                    ling_pred = ling_disc(**inputs_pred)
+                else:
+                    ling_pred = compute_lng(pred_text)
+                    ling_pred = scaler.transform([ling_pred])[0]
+                    ling_pred = torch.tensor(ling_pred).to(pred.device).float()
+                if use_embed:
+                    ling_pred_embed = self.ling_embed(ling_pred)
+                    # diff = torch.mean((ling2_embed - ling_pred_embed)**2)
+                # else:
+                # Mean squared gap between target and measured features.
+                diff = torch.mean((ling2_orig - ling_pred)**2)
+            # print(f'Diff {diff.item():.3f}>> {tokenizer.batch_decode(pred.cpu().numpy(), skip_special_tokens=True)[0]}')
+            # Stop when the gap is small enough or when c has reached 6.
+            if diff < 1e-1 or c == 6:
+                converged = True
+            elif use_embed:
+                # Interpolate the conditioning embedding towards the
+                # embedding of the measured features with step size eta.
+                ling2_embed = ling2_embed + eta * (ling_pred_embed - ling2_embed)
+                batch.update({'sent2_ling_embed': ling2_embed})
+            else:
+                ling2 = ling2 + eta * (ling_pred - ling2)
+                batch.update({'sentence2_ling': ling2})
+            c += 1
+            # Record the text only when it differs from the previous one.
+            if len(interpolations) == 0 or pred_text != interpolations[-1]:
+                interpolations.append(pred_text)
+        return [pred_text, interpolations]
+def set_grad(module, state):
+    if module is not None:
+        for p in module.parameters():
+            p.requires_grad = state
+def set_grad_except(model, name, state):
+    for n, p in model.named_parameters():
+        if not name in n:
+            p.requires_grad = state
+class SemEmbPipeline():
+    def __init__(self,
+            ckpt = "/data/mohamed/checkpoints/ling_conversion_sem_emb_best.pt"):
+        self.tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
+        self.model = SemEmb(T5EncoderModel.from_pretrained('google/flan-t5-base'), self.tokenizer.get_vocab()['</s>'])
+        state = torch.load(ckpt)
+        self.model.load_state_dict(state['model'], strict=False)
+        self.model.eval()
+        self.model.cuda()
+    def __call__(self, sentence1, sentence2):
+        sentence1 = self.tokenizer(sentence1, return_attention_mask = True, return_tensors = 'pt')
+        sentence2 = self.tokenizer(sentence2, return_attention_mask = True, return_tensors = 'pt')
+        sem_logit = self.model(
+                sentence1_input_ids = sentence1.input_ids.cuda(),
+                sentence1_attention_mask = sentence1.attention_mask.cuda(),
+                sentence2_input_ids = sentence2.input_ids.cuda(),
+                sentence2_attention_mask = sentence2.attention_mask.cuda(),
+                )
+        sem_prob = torch.sigmoid(sem_logit).item()
+        return sem_prob
+class LingDiscPipeline():
+    def __init__(self,
+                 model_name="google/flan-t5-base",
+                 disc_type='deberta',
+                 disc_ckpt='/data/mohamed/checkpoints/ling_disc/deberta-v3-small_flan-t5-base_40',
+                 # disc_type='t5',
+                 # disc_ckpt='/data/mohamed/checkpoints/ling_conversion_ling_disc.pt',
+                 ):
+        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
+        self.model = LingDisc(model_name, disc_type, disc_ckpt)
+        self.model.eval()
+        self.model.cuda()
+    def __call__(self, sentence):
+        inputs = self.tokenizer(sentence, return_tensors = 'pt')
+        with torch.no_grad():
+            ling_pred = self.model(input_ids=inputs.input_ids.cuda())
+        return ling_pred

options.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import argparse
+from datetime import datetime
+from const import lca_names, sca_names, lingfeat_names
+import os, json
+from copy import deepcopy
+import numpy as np
+def parse_args(ckpt=None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_dir', default='/data/mohamed/data')
+    parser.add_argument('--data', default='ling_conversion')
+    parser.add_argument('--data_sources')
+    parser.add_argument('--data_type', default='text')
+    parser.add_argument('--aim_repo', default='/data/mohamed/')
+    parser.add_argument('--ckpt_dir', default='/data/mohamed/checkpoints')
+    parser.add_argument('--kld_annealing', default='cyclic')
+    parser.add_argument('--lingpred_annealing', default='mono')
+    parser.add_argument('--ling_embed_type', default = 'one-layer')
+    parser.add_argument('--combine_weight', default=1, type=float)
+    parser.add_argument('--alpha_kld', default=1, type=float)
+    parser.add_argument('--alpha_lingpred', default=1, type=float)
+    parser.add_argument('--alpha_sem', default=1, type=float)
+    parser.add_argument('--max_grad_norm', default=10, type=float)
+    parser.add_argument('--sem_loss_tao', default=0.5, type=float)
+    parser.add_argument('--sem_loss_eps', default=1, type=float)
+    parser.add_argument('--ckpt')
+    parser.add_argument('--disc_ckpt')
+    parser.add_argument('--sem_ckpt')
+    parser.add_argument('--lng_ids')
+    parser.add_argument('--lng_ids_idx', type=int)
+    parser.add_argument('--lng_ids_path', default='/data/mohamed/indices')
+    parser.add_argument('--preds_dir', default='/data/mohamed/preds')
+    parser.add_argument('--model_name', default="google/flan-t5-base")
+    parser.add_argument('--disc_type', default="t5")
+    parser.add_argument('--aim_exp', default='ling-conversion')
+    parser.add_argument('--sem_loss_type', default='dedicated')
+    parser.add_argument('--combine_method', default='none')
+    parser.add_argument('--train_log', type=int, default=200)
+    parser.add_argument('--val_log', type=int, default=2000)
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--eval_batch_size', type=int, default=32)
+    parser.add_argument('--max_eval_samples', type=int, default=1000)
+    parser.add_argument('--test_batch_size', type=int, default=1)
+    parser.add_argument('--hidden_dim', type=int, default=500)
+    parser.add_argument('--latent_dim', type=int, default=150)
+    parser.add_argument('--lng_dim', type=int, default=40)
+    parser.add_argument('--disc_lng_dim', type=int)
+    parser.add_argument('--use_lora', action='store_true')
+    parser.add_argument('--lora_r', type=int, default=64)
+    parser.add_argument('--gpu', type=str, default='0')
+    parser.add_argument('--epochs', type=int, default=10)
+    parser.add_argument('--grad_accumulation', type=int, default=1)
+    parser.add_argument('--n_ica', type=int, default=10)
+    parser.add_argument('--max_length', type=int, default=200)
+    parser.add_argument('--total_steps', type=int)
+    parser.add_argument('--kld_const', type=float, default=1)
+    parser.add_argument('--lr', type=float, default=1e-4)
+    parser.add_argument('--kl_weight', type=float, default=1e-1)
+    parser.add_argument('--weight_decay', type=float, default=1e-2)
+    parser.add_argument('--ling_dropout', type=float, default=0.1)
+    parser.add_argument('--predict_fn', default = 'logs/test.txt')
+    parser.add_argument('--save_predict', action='store_true')
+    parser.add_argument('--use_ica', action='store_true')
+    parser.add_argument('--pretrain_gen', action='store_true')
+    parser.add_argument('--pretrain_sem', action='store_true')
+    parser.add_argument('--pretrain_disc', action='store_true')
+    parser.add_argument('--linggen_type', default='none')
+    parser.add_argument('--linggen_input', default='s+l')
+    parser.add_argument('--aug_same', action='store_true')
+    parser.add_argument('--ling_vae', action='store_true')
+    parser.add_argument('--process_lingpred', action='store_true')
+    parser.add_argument('--fudge_lambda', type=float, default=1.0)
+    parser.add_argument('--use_lingpred', action='store_true')
+    parser.add_argument('--ling2_only', action='store_true')
+    parser.add_argument('--cycle_loss', action='store_true')
+    parser.add_argument('--disc_loss', action='store_true')
+    parser.add_argument('--sem_loss', action='store_true')
+    parser.add_argument('--sim_loss', action='store_true')
+    parser.add_argument('--optuna', action='store_true')
+    parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--demo', action='store_true')
+    parser.add_argument('--fudge', action='store_true')
+    parser.add_argument('--fb_log', default='feedback_logs/default.txt')
+    parser.add_argument('--eval_only', action='store_true')
+    parser.add_argument('--predict_with_feedback', action='store_true')
+    parser.add_argument('--feedback_param', default = 's')
+    parser.add_argument('--eval_ling', action='store_true')
+    parser.add_argument('--seed', type=int, default=0)
+    parser.add_argument('--major_arg', default = 0, type=int)
+    parser.add_argument('--quantize_lng', action='store_true')
+    parser.add_argument('--quant_nbins', type=int, default=20)
+    parser.add_argument('--src_lng', default = 'ling')
+    parser.add_argument('--to_restore', nargs='+', default=[])
+    # args = parser.parse_args()
+    args, unknown = parser.parse_known_args()
+    args.name = f'{datetime.now().strftime("%m%d_%H-%M-%S")}-{args.data}-{args.combine_method}'
+    major_arg = args.major_arg
+    to_restore = [
+            'total_steps','major_arg','gpu','demo', 'eval_only', 'save_predict', 'predict_fn', 'fudge', 'predict_with_feedback',
+            'feedback_param', 'fb_log', 'data_dir', 'data', 'disc_ckpt', 'disc_type', 'sem_ckpt', 'fudge_lambda', 'test_batch_size', 'src_lng'
+            ] + args.to_restore
+    to_restore = {k: args.__dict__[k] for k in to_restore}
+    if not args.disc_loss or args.disc_ckpt:
+        args.disc_steps = 0
+    if args.data_sources is not None:
+        args.data_sources = args.data_sources.split(',')
+    if ckpt is not None:
+        args.ckpt = ckpt
+    args_list = [args]
+    if args.ckpt:
+        if ',' in args.ckpt:
+            ckpts = args.ckpt.split(',')
+            args_list = [deepcopy(args) for _ in range(len(ckpts))]
+            for i in range(len(ckpts)):
+                args_path = ckpts[i].replace('_best', '').replace('.pt', '.json')
+                with open(args_path) as f:
+                    args_list[i].__dict__.update(json.load(f))
+                args_list[i].__dict__.update(to_restore)
+                args_list[i].ckpt = ckpts[i]
+        else:
+            args_path = args.ckpt.replace('_best', '').replace('.pt', '.json')
+            ckpt = args.ckpt
+            with open(args_path) as f:
+                args.__dict__.update(json.load(f))
+                args.__dict__.update(to_restore)
+                args.ckpt = ckpt
+    lng_names = lca_names + sca_names + lingfeat_names
+    for i in range(len(args_list)):
+        if args_list[i].lng_ids or args_list[i].lng_ids_idx:
+            if args_list[i].lng_ids_idx:
+                lng_ids = np.load(os.path.join(args_list[i].lng_ids_path, f'{args_list[i].lng_ids_idx}.npy'))
+            elif args_list[i].lng_ids[0].isnumeric():
+                lng_ids = [int(x) for x in args_list[i].lng_ids.split(',')]
+            elif ',' in args_list[i].lng_ids:
+                lng_ids = [lng_names.index(x) for x in args_list[i].lng_ids.split(',')]
+            else:
+                lng_ids = np.load(args_list[i].lng_ids)
+            args_list[i].lng_dim = len(lng_ids)
+            args_list[i].lng_ids = lng_ids.tolist()
+            # lng_names = [lng_names[i] for i in lng_ids]
+        elif args_list[i].use_ica:
+            args_list[i].lng_dim = args_list[i].n_ica
+        if args_list[i].disc_lng_dim is None:
+            args_list[i].disc_lng_dim = args_list[i].lng_dim
+    if not args.ckpt and not args.eval_only:
+        args_path = os.path.join(args.ckpt_dir, '%s.json'%args.name)
+        with open(args_path, 'w') as f:
+            s = json.dumps(args.__dict__)
+            f.write(s)
+    return args_list[major_arg], args_list, lng_names

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+torch
+numpy
+joblib
+gensim
+supar
+transformers
+scikit-learn
+tqdm
+spacy
+sentencepiece