{ "cells": [ { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idannotatorpositionquestiontable_fileanswer_coordinatesanswer_text
0ms-is-0100What is the abbreviation of the module Informa...felix_playground_SQA_Training/MS_IS_all_module...['(0, 1)']['12-IV-161-m01']
1ms-is-0100What is the ID of IT-Management?felix_playground_SQA_Training/MS_IS_all_module...['(1, 1)']['12-M-ITM-161-m01']
2ms-is-0100What is the abbreviation of Project Seminar?felix_playground_SQA_Training/MS_IS_all_module...['(2, 1)']['12-PS-192-m01']
3ms-is-0100What is the code for Information Retrieval?felix_playground_SQA_Training/MS_IS_all_module...['(3, 1)']['10-I=IR-161-m01']
4ms-is-0100What is the abbreviation of the module Analysi...felix_playground_SQA_Training/MS_IS_all_module...['(4, 1)']['10-I=PA-161-m01']
\n", "
" ], "text/plain": [ " id annotator position \\\n", "0 ms-is-01 0 0 \n", "1 ms-is-01 0 0 \n", "2 ms-is-01 0 0 \n", "3 ms-is-01 0 0 \n", "4 ms-is-01 0 0 \n", "\n", " question \\\n", "0 What is the abbreviation of the module Informa... \n", "1 What is the ID of IT-Management? \n", "2 What is the abbreviation of Project Seminar? \n", "3 What is the code for Information Retrieval? \n", "4 What is the abbreviation of the module Analysi... \n", "\n", " table_file answer_coordinates \\\n", "0 felix_playground_SQA_Training/MS_IS_all_module... ['(0, 1)'] \n", "1 felix_playground_SQA_Training/MS_IS_all_module... ['(1, 1)'] \n", "2 felix_playground_SQA_Training/MS_IS_all_module... ['(2, 1)'] \n", "3 felix_playground_SQA_Training/MS_IS_all_module... ['(3, 1)'] \n", "4 felix_playground_SQA_Training/MS_IS_all_module... ['(4, 1)'] \n", "\n", " answer_text \n", "0 ['12-IV-161-m01'] \n", "1 ['12-M-ITM-161-m01'] \n", "2 ['12-PS-192-m01'] \n", "3 ['10-I=IR-161-m01'] \n", "4 ['10-I=PA-161-m01'] " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "sqa_data = pd.read_excel(\"module_guide_sq_abbreviation.xlsx\")\n", "# sqa_data = sqa_data.astype(str)\n", "sqa_data.head()\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idannotatorpositionquestiontable_fileanswer_coordinatesanswer_text
0ms-is-0100What is the abbreviation of the module Informa...felix_playground_SQA_Training/MS_IS_all_module...[(0, 1)][12-IV-161-m01]
1ms-is-0100What is the ID of IT-Management?felix_playground_SQA_Training/MS_IS_all_module...[(1, 1)][12-M-ITM-161-m01]
2ms-is-0100What is the abbreviation of Project Seminar?felix_playground_SQA_Training/MS_IS_all_module...[(2, 1)][12-PS-192-m01]
3ms-is-0100What is the code for Information Retrieval?felix_playground_SQA_Training/MS_IS_all_module...[(3, 1)][10-I=IR-161-m01]
4ms-is-0100What is the abbreviation of the module Analysi...felix_playground_SQA_Training/MS_IS_all_module...[(4, 1)][10-I=PA-161-m01]
5ms-is-0100What is the code for Security of Software Syst...felix_playground_SQA_Training/MS_IS_all_module...[(5, 1)][10-I=SSS-172-m01]
6ms-is-0100What is the ID of Software Architecture?felix_playground_SQA_Training/MS_IS_all_module...[(6, 1)][10-I=SAR-161-m01]
7ms-is-0100What is the abbreviation of the module Artific...felix_playground_SQA_Training/MS_IS_all_module...[(7, 1)][10-I=KI1-161-m01]
8ms-is-0100What is the abbreviation of Discrete Event Sim...felix_playground_SQA_Training/MS_IS_all_module...[(8, 1)][10-I=ST-161-m01]
9ms-is-0100What is the code for Advanced Programming?felix_playground_SQA_Training/MS_IS_all_module...[(9, 1)][10-I=APR-182-m01]
\n", "
" ], "text/plain": [ " id annotator position \\\n", "0 ms-is-01 0 0 \n", "1 ms-is-01 0 0 \n", "2 ms-is-01 0 0 \n", "3 ms-is-01 0 0 \n", "4 ms-is-01 0 0 \n", "5 ms-is-01 0 0 \n", "6 ms-is-01 0 0 \n", "7 ms-is-01 0 0 \n", "8 ms-is-01 0 0 \n", "9 ms-is-01 0 0 \n", "\n", " question \\\n", "0 What is the abbreviation of the module Informa... \n", "1 What is the ID of IT-Management? \n", "2 What is the abbreviation of Project Seminar? \n", "3 What is the code for Information Retrieval? \n", "4 What is the abbreviation of the module Analysi... \n", "5 What is the code for Security of Software Syst... \n", "6 What is the ID of Software Architecture? \n", "7 What is the abbreviation of the module Artific... \n", "8 What is the abbreviation of Discrete Event Sim... \n", "9 What is the code for Advanced Programming? \n", "\n", " table_file answer_coordinates \\\n", "0 felix_playground_SQA_Training/MS_IS_all_module... [(0, 1)] \n", "1 felix_playground_SQA_Training/MS_IS_all_module... [(1, 1)] \n", "2 felix_playground_SQA_Training/MS_IS_all_module... [(2, 1)] \n", "3 felix_playground_SQA_Training/MS_IS_all_module... [(3, 1)] \n", "4 felix_playground_SQA_Training/MS_IS_all_module... [(4, 1)] \n", "5 felix_playground_SQA_Training/MS_IS_all_module... [(5, 1)] \n", "6 felix_playground_SQA_Training/MS_IS_all_module... [(6, 1)] \n", "7 felix_playground_SQA_Training/MS_IS_all_module... [(7, 1)] \n", "8 felix_playground_SQA_Training/MS_IS_all_module... [(8, 1)] \n", "9 felix_playground_SQA_Training/MS_IS_all_module... [(9, 1)] \n", "\n", " answer_text \n", "0 [12-IV-161-m01] \n", "1 [12-M-ITM-161-m01] \n", "2 [12-PS-192-m01] \n", "3 [10-I=IR-161-m01] \n", "4 [10-I=PA-161-m01] \n", "5 [10-I=SSS-172-m01] \n", "6 [10-I=SAR-161-m01] \n", "7 [10-I=KI1-161-m01] \n", "8 [10-I=ST-161-m01] \n", "9 [10-I=APR-182-m01] " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import ast\n", "\n", "def _parse_answer_coordinates(answer_coordinate_str):\n", " \"\"\"Parses the answer_coordinates of a question.\n", " Args:\n", " answer_coordinate_str: A string representation of a Python list of tuple\n", " strings.\n", " For example: \"['(1, 4)','(1, 3)', ...]\"\n", " \"\"\"\n", "\n", " try:\n", " answer_coordinates = []\n", " # make a list of strings\n", " coords = ast.literal_eval(answer_coordinate_str)\n", " # parse each string as a tuple\n", " for row_index, column_index in sorted(\n", " ast.literal_eval(coord) for coord in coords):\n", " answer_coordinates.append((row_index, column_index))\n", " except SyntaxError:\n", " raise ValueError('Unable to evaluate %s' % answer_coordinate_str)\n", " \n", " return answer_coordinates\n", "\n", "\n", "def _parse_answer_text(answer_text):\n", " \"\"\"Populates the answer_texts field of `answer` by parsing `answer_text`.\n", " Args:\n", " answer_text: A string representation of a Python list of strings.\n", " For example: \"[u'test', u'hello', ...]\"\n", " answer: an Answer object.\n", " \"\"\"\n", " try:\n", " answer = []\n", " for value in ast.literal_eval(answer_text):\n", " answer.append(value)\n", " except SyntaxError:\n", " raise ValueError('Unable to evaluate %s' % answer_text)\n", "\n", " return answer\n", "\n", "sqa_data['answer_coordinates'] = sqa_data['answer_coordinates'].apply(lambda coords_str: _parse_answer_coordinates(coords_str))\n", "sqa_data['answer_text'] = sqa_data['answer_text'].apply(lambda txt: _parse_answer_text(txt))\n", "\n", "sqa_data.head(10)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idannotatorpositionquestiontable_fileanswer_coordinatesanswer_textsequence_id
0ms-is-0100What is the abbreviation of the module Informa...felix_playground_SQA_Training/MS_IS_all_module...[(0, 1)][12-IV-161-m01]ms-is-01-0
1ms-is-0100What is the ID of IT-Management?felix_playground_SQA_Training/MS_IS_all_module...[(1, 1)][12-M-ITM-161-m01]ms-is-01-0
2ms-is-0100What is the abbreviation of Project Seminar?felix_playground_SQA_Training/MS_IS_all_module...[(2, 1)][12-PS-192-m01]ms-is-01-0
3ms-is-0100What is the code for Information Retrieval?felix_playground_SQA_Training/MS_IS_all_module...[(3, 1)][10-I=IR-161-m01]ms-is-01-0
4ms-is-0100What is the abbreviation of the module Analysi...felix_playground_SQA_Training/MS_IS_all_module...[(4, 1)][10-I=PA-161-m01]ms-is-01-0
\n", "
" ], "text/plain": [ " id annotator position \\\n", "0 ms-is-01 0 0 \n", "1 ms-is-01 0 0 \n", "2 ms-is-01 0 0 \n", "3 ms-is-01 0 0 \n", "4 ms-is-01 0 0 \n", "\n", " question \\\n", "0 What is the abbreviation of the module Informa... \n", "1 What is the ID of IT-Management? \n", "2 What is the abbreviation of Project Seminar? \n", "3 What is the code for Information Retrieval? \n", "4 What is the abbreviation of the module Analysi... \n", "\n", " table_file answer_coordinates \\\n", "0 felix_playground_SQA_Training/MS_IS_all_module... [(0, 1)] \n", "1 felix_playground_SQA_Training/MS_IS_all_module... [(1, 1)] \n", "2 felix_playground_SQA_Training/MS_IS_all_module... [(2, 1)] \n", "3 felix_playground_SQA_Training/MS_IS_all_module... [(3, 1)] \n", "4 felix_playground_SQA_Training/MS_IS_all_module... [(4, 1)] \n", "\n", " answer_text sequence_id \n", "0 [12-IV-161-m01] ms-is-01-0 \n", "1 [12-M-ITM-161-m01] ms-is-01-0 \n", "2 [12-PS-192-m01] ms-is-01-0 \n", "3 [10-I=IR-161-m01] ms-is-01-0 \n", "4 [10-I=PA-161-m01] ms-is-01-0 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_sequence_id(example_id, annotator):\n", " if \"-\" in str(annotator):\n", " raise ValueError('\"-\" not allowed in annotator.')\n", " return f\"{example_id}-{annotator}\"\n", "\n", "sqa_data['sequence_id'] = sqa_data.apply(lambda x: get_sequence_id(x.id, x.annotator), axis=1)\n", "sqa_data.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questiontable_fileanswer_coordinatesanswer_text
sequence_id
ms-is-01-0[What is the abbreviation of the module Inform...felix_playground_SQA_Training/MS_IS_all_module...[[(0, 1)], [(1, 1)], [(2, 1)], [(3, 1)], [(4, ...[[12-IV-161-m01], [12-M-ITM-161-m01], [12-PS-1...
\n", "
" ], "text/plain": [ " question \\\n", "sequence_id \n", "ms-is-01-0 [What is the abbreviation of the module Inform... \n", "\n", " table_file \\\n", "sequence_id \n", "ms-is-01-0 felix_playground_SQA_Training/MS_IS_all_module... \n", "\n", " answer_coordinates \\\n", "sequence_id \n", "ms-is-01-0 [[(0, 1)], [(1, 1)], [(2, 1)], [(3, 1)], [(4, ... \n", "\n", " answer_text \n", "sequence_id \n", "ms-is-01-0 [[12-IV-161-m01], [12-M-ITM-161-m01], [12-PS-1... " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# let's group table-question pairs by sequence id, and remove some columns we don't need \n", "grouped = sqa_data.groupby(by='sequence_id').agg(lambda x: x.tolist())\n", "grouped = grouped.drop(columns=['id', 'annotator', 'position'])\n", "grouped['table_file'] = grouped['table_file'].apply(lambda x: x[0])\n", "grouped.head(10)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Module titleAbbreviationModule coordinatorModule offered byETCSMethod of gradingDurationModule levelContentsIntended learning outcomesCoursesMethod of assessmentAllocation of placesAdditional informationWorkloadTeaching cycleReferred to in LPO I
0Information Processing within Organizations12-IV-161-m01Faculty of Business Management and Economicsholder of the Chair of Business Management and...5numerical grade1 semestergraduateContent:This course provides students with an ...After completing the course \"Integrated Inform...V (2) + Ü (2)written examination (approx. 60 minutes)Langua...----150 h----
1IT-Management12-M-ITM-161-m01Faculty of Business Management and EconomicsHolder of the Chair of Information Systems Eng...5numerical grade1 semestergraduateContent:This course provides students with an ...After completing the course \"IT Management\", s...V (2) + Ü (2)a) written examination (approx. 60 minutes) or...----150 h----
2Project Seminar12-PS-192-m01Faculty of Business Management and EconomicsHolder of the Chair of Business Management and...15numerical grade1 semestergraduateContent:In small project teams of 4 to 10 memb...After completing the course \"Projektseminar\", ...S (2)project: preparing a conceptual design (approx...----450 h----
3Information Retrieval10-I=IR-161-m01Institute of Computer ScienceDean of Studies Informatik (Computer Science)5numerical grade1 semestergraduateIR models (e. g. Boolean and vector space mode...The students possess theoretical and practical...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
4Analysis and Design of Programs10-I=PA-161-m01Institute of Computer Scienceholder of the Chair of Computer Science II5numerical grade1 semestergraduateProgram analysis, model creation in software e...The students are able to analyse programs, to ...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
......................................................
115Vertical Storytelling12-M-VS-221-m01Faculty of Business Management and Economicsnan10numerical grade1 semesternan----S (2)portfolio (approx. 5 pages)Assessment offered:...----300 h----
116Organizational Economics and Digital Transform...12-M-OEDT-231-m01Faculty of Business Management and Economicsnan5numerical grade1 semesternan----V (2) + Ü (2)Module taught in: Englisha) written examination (approx. 60 minutes) or...----150 h----
117Policy Evaluation Methods12-M-PEM-182-m01Faculty of Business Management and EconomicsHolder of the Chair of Labor Economics5numerical grade1 semestergraduateThis course offers an introduction to the fund...At the end of the course, students should be a...V (2) + Ü (2)Module taught in: Englisha) written examination (approx. 60 minutes) or...--Research track module in Masters programme IEP150 h----
118Topics in Empirical Economics12-M-TE-231-m01Faculty of Business Management and Economicsnan5numerical grade1 semesternan----V (2) + Ü (2)Module taught in: Englishportfolio (approx. 50 hours)Prüfungssprache: E...12 *WA1(1) Should the number of applications e...--150 h----
119Master Thesis Information Systems12-WI-MA-192-m01Faculty of Business Management and EconomicsDean of the Faculty of Business Management and...30numerical grade1 semestergraduateStudents will complete their degree with a Mas...In the master thesis students prove that they ...--Masters thesis (approx. 60 to 80 pages)Languag...--Time to complete: 6 months900 h----
\n", "

120 rows × 17 columns

\n", "
" ], "text/plain": [ " Module title Abbreviation \\\n", "0 Information Processing within Organizations 12-IV-161-m01 \n", "1 IT-Management 12-M-ITM-161-m01 \n", "2 Project Seminar 12-PS-192-m01 \n", "3 Information Retrieval 10-I=IR-161-m01 \n", "4 Analysis and Design of Programs 10-I=PA-161-m01 \n", ".. ... ... \n", "115 Vertical Storytelling 12-M-VS-221-m01 \n", "116 Organizational Economics and Digital Transform... 12-M-OEDT-231-m01 \n", "117 Policy Evaluation Methods 12-M-PEM-182-m01 \n", "118 Topics in Empirical Economics 12-M-TE-231-m01 \n", "119 Master Thesis Information Systems 12-WI-MA-192-m01 \n", "\n", " Module coordinator \\\n", "0 Faculty of Business Management and Economics \n", "1 Faculty of Business Management and Economics \n", "2 Faculty of Business Management and Economics \n", "3 Institute of Computer Science \n", "4 Institute of Computer Science \n", ".. ... \n", "115 Faculty of Business Management and Economics \n", "116 Faculty of Business Management and Economics \n", "117 Faculty of Business Management and Economics \n", "118 Faculty of Business Management and Economics \n", "119 Faculty of Business Management and Economics \n", "\n", " Module offered by ETCS Method of grading \\\n", "0 holder of the Chair of Business Management and... 5 numerical grade \n", "1 Holder of the Chair of Information Systems Eng... 5 numerical grade \n", "2 Holder of the Chair of Business Management and... 15 numerical grade \n", "3 Dean of Studies Informatik (Computer Science) 5 numerical grade \n", "4 holder of the Chair of Computer Science II 5 numerical grade \n", ".. ... ... ... \n", "115 nan 10 numerical grade \n", "116 nan 5 numerical grade \n", "117 Holder of the Chair of Labor Economics 5 numerical grade \n", "118 nan 5 numerical grade \n", "119 Dean of the Faculty of Business Management and... 30 numerical grade \n", "\n", " Duration Module level \\\n", "0 1 semester graduate \n", "1 1 semester graduate \n", "2 1 semester graduate \n", "3 1 semester graduate \n", "4 1 semester graduate \n", ".. ... ... \n", "115 1 semester nan \n", "116 1 semester nan \n", "117 1 semester graduate \n", "118 1 semester nan \n", "119 1 semester graduate \n", "\n", " Contents \\\n", "0 Content:This course provides students with an ... \n", "1 Content:This course provides students with an ... \n", "2 Content:In small project teams of 4 to 10 memb... \n", "3 IR models (e. g. Boolean and vector space mode... \n", "4 Program analysis, model creation in software e... \n", ".. ... \n", "115 -- \n", "116 -- \n", "117 This course offers an introduction to the fund... \n", "118 -- \n", "119 Students will complete their degree with a Mas... \n", "\n", " Intended learning outcomes \\\n", "0 After completing the course \"Integrated Inform... \n", "1 After completing the course \"IT Management\", s... \n", "2 After completing the course \"Projektseminar\", ... \n", "3 The students possess theoretical and practical... \n", "4 The students are able to analyse programs, to ... \n", ".. ... \n", "115 -- \n", "116 -- \n", "117 At the end of the course, students should be a... \n", "118 -- \n", "119 In the master thesis students prove that they ... \n", "\n", " Courses \\\n", "0 V (2) + Ü (2) \n", "1 V (2) + Ü (2) \n", "2 S (2) \n", "3 V (2) + Ü (2) \n", "4 V (2) + Ü (2) \n", ".. ... \n", "115 S (2) \n", "116 V (2) + Ü (2)Module taught in: English \n", "117 V (2) + Ü (2)Module taught in: English \n", "118 V (2) + Ü (2)Module taught in: English \n", "119 -- \n", "\n", " Method of assessment \\\n", "0 written examination (approx. 60 minutes)Langua... \n", "1 a) written examination (approx. 60 minutes) or... \n", "2 project: preparing a conceptual design (approx... \n", "3 written examination (approx. 60 to 120 minutes... \n", "4 written examination (approx. 60 to 120 minutes... \n", ".. ... \n", "115 portfolio (approx. 5 pages)Assessment offered:... \n", "116 a) written examination (approx. 60 minutes) or... \n", "117 a) written examination (approx. 60 minutes) or... \n", "118 portfolio (approx. 50 hours)Prüfungssprache: E... \n", "119 Masters thesis (approx. 60 to 80 pages)Languag... \n", "\n", " Allocation of places \\\n", "0 -- \n", "1 -- \n", "2 -- \n", "3 -- \n", "4 -- \n", ".. ... \n", "115 -- \n", "116 -- \n", "117 -- \n", "118 12 *WA1(1) Should the number of applications e... \n", "119 -- \n", "\n", " Additional information Workload \\\n", "0 -- 150 h \n", "1 -- 150 h \n", "2 -- 450 h \n", "3 Focuses available for students of the Masters ... 150 h \n", "4 Focuses available for students of the Masters ... 150 h \n", ".. ... ... \n", "115 -- 300 h \n", "116 -- 150 h \n", "117 Research track module in Masters programme IEP 150 h \n", "118 -- 150 h \n", "119 Time to complete: 6 months 900 h \n", "\n", " Teaching cycle Referred to in LPO I \n", "0 -- -- \n", "1 -- -- \n", "2 -- -- \n", "3 -- -- \n", "4 -- -- \n", ".. ... ... \n", "115 -- -- \n", "116 -- -- \n", "117 -- -- \n", "118 -- -- \n", "119 -- -- \n", "\n", "[120 rows x 17 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "['What is the abbreviation of the module Information Processing within Organizations?', 'What is the ID of IT-Management?', 'What is the abbreviation of Project Seminar?', 'What is the code for Information Retrieval?', 'What is the abbreviation of the module Analysis and Design of Programs?', 'What is the code for Security of Software Systems?', 'What is the ID of Software Architecture?', 'What is the abbreviation of the module Artificial Intelligence 1?', 'What is the abbreviation of Discrete Event Simulation?', 'What is the code for Advanced Programming?', 'What is the code for Programming with neural nets?', 'What is the abbreviation of the module NLP and Text Mining?', 'What is the code for Systems Benchmarking?', 'What is the code for Computer Vision?', 'What is the abbreviation of Image Processing and Computational Photography?', 'What is the abbreviation of the module Multilingual NLP?']\n" ] } ], "source": [ "# path to the directory containing all csv files\n", "table_csv_path = \"table_csv\"\n", "\n", "item = grouped.iloc[0]\n", "table = pd.read_csv(\"table_csv/MS_IS_all_modules_orginal_cleaned.csv\").astype(str) \n", "\n", "display(table)\n", "print(\"\")\n", "print(item.question)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import TapasTokenizer\n", "# initialize the tokenizer\n", "tokenizer = TapasTokenizer.from_pretrained(\"google/tapas-base\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['input_ids', 'labels', 'numeric_values', 'numeric_values_scale', 'token_type_ids', 'attention_mask'])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = tokenizer(table=table, queries=item.question, answer_coordinates=item.answer_coordinates, answer_text=item.answer_text,\n", " truncation=True, padding=\"max_length\", return_tensors=\"pt\")\n", "encoding.keys()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[CLS] what is the abbreviation of the module information processing within organizations? [SEP] module abbreviation module module etcs method duration module contents intended courses method allocation additional workload teaching referred information 12 faculty holder 5 numerical 1 graduate content after v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] it 12 faculty holder 5 numerical 1 graduate content after v a [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] project 12 faculty holder 15 numerical 1 graduate content after s project [EMPTY] [EMPTY] 450 [EMPTY] [EMPTY] information 10 institute dean 5 numerical 1 graduate ir the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] analysis 10 institute holder 5 numerical 1 graduate program the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] security 10 institute holder 5 numerical 1 graduate the students v written [EMPTY] focuses 150 [EMPTY] [EMPTY] software 10 institute holder 5 numerical 1 graduate current the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] artificial 10 institute holder 5 numerical 1 graduate intelligent the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] discrete 10 institute holder 8 numerical 1 graduate introduction the v written [EMPTY] focuses 240 [EMPTY] [EMPTY] advanced 10 institute holder 5 numerical 1 graduate with students v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] programming 10 institute holder 5 numerical 1 graduate overview knowledge v written [EMPTY] focuses 150 [EMPTY] [EMPTY] nlp 10 institute holder 5 numerical 1 graduate foundations the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] systems 10 institute holder 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] computer 10 institute dean 5 numerical 1 graduate the students v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] image 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] multilingual 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] statistical 10 institute holder 5 numerical 1 graduate networks the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] operations 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] machine 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] data 10 institute holder 5 numerical 1 graduate foundations the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] business 12 faculty holder 5 numerical 1 graduate content after v a 20 [EMPTY] 150 [EMPTY] [EMPTY] business 12 faculty holder 5 numerical 1 graduate content after v a 20 [EMPTY] 150 [EMPTY] [EMPTY] advanced 12 faculty holder 10 numerical 1 graduate in after s term 20 [EMPTY] 300 [EMPTY] [EMPTY] decision 12 faculty holder 5 numerical 1 graduate the after v a 40 [EMPTY] 150 [EMPTY] [EMPTY] analytical 12 faculty holder 5 numerical 1 graduate the the v written 40 [EMPTY] 150 [EMPTY] [EMPTY] business 12 faculty holder 10 numerical 1 graduate in the s term 20 [EMPTY] 300 [EMPTY] [EMPTY] e 12 faculty holder 5 numerical 1 graduate the - v a 40 [EMPTY] 150 [EMPTY] [EMPTY] mobile 12 faculty holder 5 numerical 1 graduate the - u a [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY]'" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(encoding[\"input_ids\"][0])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['12-IV-161-m01']\n" ] } ], "source": [ "print(item.answer_text[0])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[CLS] 0\n", "what 0\n", "is 0\n", "the 0\n", "id 0\n", "of 0\n", "it 0\n", "- 0\n", "management 0\n", "? 0\n", "[SEP] 0\n", "module 0\n", "abbreviation 0\n", "module 0\n", "module 0\n", "etc 0\n", "##s 0\n", "method 0\n", "duration 0\n", "module 0\n", "contents 0\n", "intended 0\n", "courses 0\n", "method 0\n", "allocation 0\n", "additional 0\n", "work 0\n", "##load 0\n", "teaching 0\n", "referred 0\n", "information 0\n", "12 1\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "it 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "a 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "project 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "15 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "s 0\n", "project 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "450 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "information 0\n", "10 0\n", "institute 0\n", "dean 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "ir 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "analysis 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "program 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "security 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "students 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "software 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "current 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "artificial 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "intelligent 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "discrete 0\n", "10 0\n", "institute 0\n", "holder 0\n", "8 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "introduction 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "240 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "advanced 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "with 0\n", "students 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "programming 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "overview 0\n", "knowledge 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "nl 0\n", "##p 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "foundations 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "systems 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "computer 0\n", "10 0\n", "institute 0\n", "dean 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "students 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "image 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "multi 0\n", "##ling 0\n", "##ual 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "statistical 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "networks 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "operations 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "machine 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "data 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "foundations 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "business 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "a 0\n", "20 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "business 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "a 0\n", "20 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "advanced 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "10 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "in 0\n", "after 0\n", "s 0\n", "term 0\n", "20 0\n", "[EMPTY] 0\n", "300 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "decision 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "after 0\n", "v 0\n", "a 0\n", "40 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "analytical 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "the 0\n", "v 0\n", "written 0\n", "40 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "business 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "10 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "in 0\n", "the 0\n", "s 0\n", "term 0\n", "20 0\n", "[EMPTY] 0\n", "300 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "e 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "- 0\n", "v 0\n", "a 0\n", "40 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "mobile 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "- 0\n", "u 0\n", "a 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n" ] } ], "source": [ "for id, prev_label in zip (encoding[\"input_ids\"][1], encoding[\"token_type_ids\"][1][:,3]):\n", " if id != 0: # we skip padding tokens\n", " print(tokenizer.decode([id]), prev_label.item())" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "class TableDataset(torch.utils.data.Dataset):\n", " def __init__(self, df, tokenizer):\n", " self.df = df\n", " self.tokenizer = tokenizer\n", "\n", " def __getitem__(self, idx):\n", " item = self.df.iloc[idx]\n", " table = pd.read_csv(\"table_csv/MS_IS_all_modules_orginal_cleaned.csv\").astype(str) # TapasTokenizer expects the table data to be text only\n", " if item.position != 0:\n", " # use the previous table-question pair to correctly set the prev_labels token type ids\n", " previous_item = self.df.iloc[idx-1]\n", " encoding = self.tokenizer(table=table, \n", " queries=[previous_item.question, item.question], \n", " answer_coordinates=[previous_item.answer_coordinates, item.answer_coordinates], \n", " answer_text=[previous_item.answer_text, item.answer_text],\n", " padding=\"max_length\",\n", " truncation=True,\n", " return_tensors=\"pt\"\n", " )\n", " # use encodings of second table-question pair in the batch\n", " encoding = {key: val[-1] for key, val in encoding.items()}\n", " else:\n", " # this means it's the first table-question pair in a sequence\n", " encoding = self.tokenizer(table=table, \n", " queries=item.question, \n", " answer_coordinates=item.answer_coordinates, \n", " answer_text=item.answer_text,\n", " padding=\"max_length\",\n", " truncation=True,\n", " return_tensors=\"pt\"\n", " )\n", " # remove the batch dimension which the tokenizer adds \n", " encoding = {key: val.squeeze(0) for key, val in encoding.items()}\n", " return encoding\n", "\n", " def __len__(self):\n", " return len(self.df)\n", "\n", "train_dataset = TableDataset(df=sqa_data, tokenizer=tokenizer)\n", "train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([512, 7])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_dataset[0][\"token_type_ids\"].shape" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "batch = next(iter(train_dataloader))" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([2, 512])" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch[\"input_ids\"].shape" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([2, 512, 7])" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch[\"token_type_ids\"].shape" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[CLS] what is the abbreviation of the module information processing within organizations? [SEP] module abbreviation module module etcs method duration module contents intended courses method allocation additional workload teaching referred information 12 faculty holder 5 numerical 1 graduate content after v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] it 12 faculty holder 5 numerical 1 graduate content after v a [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] project 12 faculty holder 15 numerical 1 graduate content after s project [EMPTY] [EMPTY] 450 [EMPTY] [EMPTY] information 10 institute dean 5 numerical 1 graduate ir the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] analysis 10 institute holder 5 numerical 1 graduate program the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] security 10 institute holder 5 numerical 1 graduate the students v written [EMPTY] focuses 150 [EMPTY] [EMPTY] software 10 institute holder 5 numerical 1 graduate current the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] artificial 10 institute holder 5 numerical 1 graduate intelligent the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] discrete 10 institute holder 8 numerical 1 graduate introduction the v written [EMPTY] focuses 240 [EMPTY] [EMPTY] advanced 10 institute holder 5 numerical 1 graduate with students v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] programming 10 institute holder 5 numerical 1 graduate overview knowledge v written [EMPTY] focuses 150 [EMPTY] [EMPTY] nlp 10 institute holder 5 numerical 1 graduate foundations the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] systems 10 institute holder 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] computer 10 institute dean 5 numerical 1 graduate the students v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] image 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] multilingual 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] statistical 10 institute holder 5 numerical 1 graduate networks the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] operations 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] machine 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] data 10 institute holder 5 numerical 1 graduate foundations the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] business 12 faculty holder 5 numerical 1 graduate content after v a 20 [EMPTY] 150 [EMPTY] [EMPTY] business 12 faculty holder 5 numerical 1 graduate content after v a 20 [EMPTY] 150 [EMPTY] [EMPTY] advanced 12 faculty holder 10 numerical 1 graduate in after s term 20 [EMPTY] 300 [EMPTY] [EMPTY] decision 12 faculty holder 5 numerical 1 graduate the after v a 40 [EMPTY] 150 [EMPTY] [EMPTY] analytical 12 faculty holder 5 numerical 1 graduate the the v written 40 [EMPTY] 150 [EMPTY] [EMPTY] business 12 faculty holder 10 numerical 1 graduate in the s term 20 [EMPTY] 300 [EMPTY] [EMPTY] e 12 faculty holder 5 numerical 1 graduate the - v a 40 [EMPTY] 150 [EMPTY] [EMPTY] mobile 12 faculty holder 5 numerical 1 graduate the - u a [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY]'" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(batch[\"input_ids\"][0])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "#first example should not have any prev_labels set\n", "assert batch[\"token_type_ids\"][0][:,3].sum() == 0" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[CLS] what is the id of it - management? [SEP] module abbreviation module module etcs method duration module contents intended courses method allocation additional workload teaching referred information 12 faculty holder 5 numerical 1 graduate content after v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] it 12 faculty holder 5 numerical 1 graduate content after v a [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] project 12 faculty holder 15 numerical 1 graduate content after s project [EMPTY] [EMPTY] 450 [EMPTY] [EMPTY] information 10 institute dean 5 numerical 1 graduate ir the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] analysis 10 institute holder 5 numerical 1 graduate program the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] security 10 institute holder 5 numerical 1 graduate the students v written [EMPTY] focuses 150 [EMPTY] [EMPTY] software 10 institute holder 5 numerical 1 graduate current the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] artificial 10 institute holder 5 numerical 1 graduate intelligent the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] discrete 10 institute holder 8 numerical 1 graduate introduction the v written [EMPTY] focuses 240 [EMPTY] [EMPTY] advanced 10 institute holder 5 numerical 1 graduate with students v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] programming 10 institute holder 5 numerical 1 graduate overview knowledge v written [EMPTY] focuses 150 [EMPTY] [EMPTY] nlp 10 institute holder 5 numerical 1 graduate foundations the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] systems 10 institute holder 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] computer 10 institute dean 5 numerical 1 graduate the students v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] image 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] multilingual 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] statistical 10 institute holder 5 numerical 1 graduate networks the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] operations 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] machine 10 institute [EMPTY] 5 numerical 1 [EMPTY] [EMPTY] [EMPTY] v written [EMPTY] focuses 150 [EMPTY] [EMPTY] data 10 institute holder 5 numerical 1 graduate foundations the v written [EMPTY] focuses 150 [EMPTY] [EMPTY] business 12 faculty holder 5 numerical 1 graduate content after v a 20 [EMPTY] 150 [EMPTY] [EMPTY] business 12 faculty holder 5 numerical 1 graduate content after v a 20 [EMPTY] 150 [EMPTY] [EMPTY] advanced 12 faculty holder 10 numerical 1 graduate in after s term 20 [EMPTY] 300 [EMPTY] [EMPTY] decision 12 faculty holder 5 numerical 1 graduate the after v a 40 [EMPTY] 150 [EMPTY] [EMPTY] analytical 12 faculty holder 5 numerical 1 graduate the the v written 40 [EMPTY] 150 [EMPTY] [EMPTY] business 12 faculty holder 10 numerical 1 graduate in the s term 20 [EMPTY] 300 [EMPTY] [EMPTY] e 12 faculty holder 5 numerical 1 graduate the - v a 40 [EMPTY] 150 [EMPTY] [EMPTY] mobile 12 faculty holder 5 numerical 1 graduate the - u a [EMPTY] [EMPTY] 150 [EMPTY] [EMPTY] [PAD] [PAD] [PAD]'" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(batch[\"input_ids\"][1])" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[CLS] 0\n", "what 0\n", "is 0\n", "the 0\n", "id 0\n", "of 0\n", "it 0\n", "- 0\n", "management 0\n", "? 0\n", "[SEP] 0\n", "module 0\n", "abbreviation 0\n", "module 0\n", "module 0\n", "etc 0\n", "##s 0\n", "method 0\n", "duration 0\n", "module 0\n", "contents 0\n", "intended 0\n", "courses 0\n", "method 0\n", "allocation 0\n", "additional 0\n", "work 0\n", "##load 0\n", "teaching 0\n", "referred 0\n", "information 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "it 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "a 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "project 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "15 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "s 0\n", "project 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "450 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "information 0\n", "10 0\n", "institute 0\n", "dean 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "ir 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "analysis 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "program 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "security 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "students 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "software 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "current 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "artificial 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "intelligent 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "discrete 0\n", "10 0\n", "institute 0\n", "holder 0\n", "8 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "introduction 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "240 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "advanced 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "with 0\n", "students 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "programming 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "overview 0\n", "knowledge 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "nl 0\n", "##p 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "foundations 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "systems 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "computer 0\n", "10 0\n", "institute 0\n", "dean 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "students 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "image 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "multi 0\n", "##ling 0\n", "##ual 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "statistical 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "networks 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "operations 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "machine 0\n", "10 0\n", "institute 0\n", "[EMPTY] 0\n", "5 0\n", "numerical 0\n", "1 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "data 0\n", "10 0\n", "institute 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "foundations 0\n", "the 0\n", "v 0\n", "written 0\n", "[EMPTY] 0\n", "focuses 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "business 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "a 0\n", "20 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "business 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "content 0\n", "after 0\n", "v 0\n", "a 0\n", "20 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "advanced 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "10 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "in 0\n", "after 0\n", "s 0\n", "term 0\n", "20 0\n", "[EMPTY] 0\n", "300 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "decision 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "after 0\n", "v 0\n", "a 0\n", "40 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "analytical 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "the 0\n", "v 0\n", "written 0\n", "40 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "business 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "10 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "in 0\n", "the 0\n", "s 0\n", "term 0\n", "20 0\n", "[EMPTY] 0\n", "300 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "e 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "- 0\n", "v 0\n", "a 0\n", "40 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "mobile 0\n", "12 0\n", "faculty 0\n", "holder 0\n", "5 0\n", "numerical 0\n", "1 0\n", "graduate 0\n", "the 0\n", "- 0\n", "u 0\n", "a 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n", "150 0\n", "[EMPTY] 0\n", "[EMPTY] 0\n" ] } ], "source": [ "for id, prev_label in zip(batch[\"input_ids\"][1], batch[\"token_type_ids\"][1][:,3]):\n", " if id != 0:\n", " print(tokenizer.decode([id]), prev_label.item())" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['output_weights', 'output_bias', 'column_output_bias', 'column_output_weights']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "data": { "text/plain": [ "TapasForQuestionAnswering(\n", " (tapas): TapasModel(\n", " (embeddings): TapasEmbeddings(\n", " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", " (position_embeddings): Embedding(1024, 768)\n", " (token_type_embeddings_0): Embedding(3, 768)\n", " (token_type_embeddings_1): Embedding(256, 768)\n", " (token_type_embeddings_2): Embedding(256, 768)\n", " (token_type_embeddings_3): Embedding(2, 768)\n", " (token_type_embeddings_4): Embedding(256, 768)\n", " (token_type_embeddings_5): Embedding(256, 768)\n", " (token_type_embeddings_6): Embedding(10, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.07, inplace=False)\n", " )\n", " (encoder): TapasEncoder(\n", " (layer): ModuleList(\n", " (0-11): 12 x TapasLayer(\n", " (attention): TapasAttention(\n", " (self): TapasSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (output): TapasSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.07, inplace=False)\n", " )\n", " )\n", " (intermediate): TapasIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): TapasOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.07, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (pooler): TapasPooler(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (activation): Tanh()\n", " )\n", " )\n", " (dropout): Dropout(p=0.07, inplace=False)\n", ")" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import TapasForQuestionAnswering\n", "\n", "model = TapasForQuestionAnswering.from_pretrained(\"google/tapas-base\")\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "model.to(device)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\FelixNeubauer\\anaconda3\\envs\\py38\\lib\\site-packages\\transformers\\optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch: 0\n", "Loss: 3.656862497329712\n", "Loss: 2.2393763065338135\n", "Loss: 1.0310397148132324\n", "Loss: 0.40564000606536865\n", "Loss: 0.27754873037338257\n", "Loss: 0.2089463472366333\n", "Loss: 0.18136566877365112\n", "Loss: 0.16886121034622192\n", "Epoch: 1\n", "Loss: 0.15656918287277222\n", "Loss: 0.1554507166147232\n", "Loss: 0.1592925786972046\n", "Loss: 0.15718212723731995\n", "Loss: 0.1562442183494568\n", "Loss: 0.15801993012428284\n", "Loss: 0.15340173244476318\n", "Loss: 0.1572810709476471\n", "Epoch: 2\n", "Loss: 0.14915084838867188\n", "Loss: 0.14941248297691345\n", "Loss: 0.1528778076171875\n", "Loss: 0.1508742719888687\n", "Loss: 0.1506081521511078\n", "Loss: 0.15169525146484375\n", "Loss: 0.14878258109092712\n", "Loss: 0.15179219841957092\n", "Epoch: 3\n", "Loss: 0.14902304112911224\n", "Loss: 0.14820906519889832\n", "Loss: 0.14895713329315186\n", "Loss: 0.14678704738616943\n", "Loss: 0.14645236730575562\n", "Loss: 0.14753571152687073\n", "Loss: 0.14482052624225616\n", "Loss: 0.14703711867332458\n", "Epoch: 4\n", "Loss: 0.1496383100748062\n", "Loss: 0.14670369029045105\n", "Loss: 0.14437901973724365\n", "Loss: 0.14174023270606995\n", "Loss: 0.14217612147331238\n", "Loss: 0.1428726762533188\n", "Loss: 0.14048904180526733\n", "Loss: 0.14207255840301514\n" ] } ], "source": [ "from transformers import AdamW\n", "\n", "optimizer = AdamW(model.parameters(), lr=5e-5)\n", "\n", "for epoch in range(5): # loop over the dataset multiple times\n", " print(\"Epoch:\", epoch)\n", " for idx, batch in enumerate(train_dataloader):\n", " # get the inputs;\n", " input_ids = batch[\"input_ids\"].to(device)\n", " attention_mask = batch[\"attention_mask\"].to(device)\n", " token_type_ids = batch[\"token_type_ids\"].to(device)\n", " labels = batch[\"labels\"].to(device)\n", " \n", " # zero the parameter gradients\n", " optimizer.zero_grad()\n", " # forward + backward + optimize\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,\n", " labels=labels)\n", " loss = outputs.loss\n", " print(\"Loss:\", loss.item())\n", " loss.backward()\n", " optimizer.step()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "import collections\n", "import numpy as np\n", "\n", "def compute_prediction_sequence(model, data, device):\n", " \"\"\"Computes predictions using model's answers to the previous questions.\"\"\"\n", " \n", " # prepare data\n", " input_ids = data[\"input_ids\"].to(device)\n", " attention_mask = data[\"attention_mask\"].to(device)\n", " token_type_ids = data[\"token_type_ids\"].to(device)\n", "\n", " all_logits = []\n", " prev_answers = None\n", "\n", " num_batch = data[\"input_ids\"].shape[0]\n", " \n", " for idx in range(num_batch):\n", " \n", " if prev_answers is not None:\n", " coords_to_answer = prev_answers[idx]\n", " # Next, set the label ids predicted by the model\n", " prev_label_ids_example = token_type_ids_example[:,3] # shape (seq_len,)\n", " model_label_ids = np.zeros_like(prev_label_ids_example.cpu().numpy()) # shape (seq_len,)\n", "\n", " # for each token in the sequence:\n", " token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)\n", " for i in range(model_label_ids.shape[0]):\n", " segment_id = token_type_ids_example[:,0].tolist()[i]\n", " col_id = token_type_ids_example[:,1].tolist()[i] - 1\n", " row_id = token_type_ids_example[:,2].tolist()[i] - 1\n", " if row_id >= 0 and col_id >= 0 and segment_id == 1:\n", " model_label_ids[i] = int(coords_to_answer[(col_id, row_id)])\n", "\n", " # set the prev label ids of the example (shape (1, seq_len) )\n", " token_type_ids_example[:,3] = torch.from_numpy(model_label_ids).type(torch.long).to(device) \n", "\n", " prev_answers = {}\n", " # get the example\n", " input_ids_example = input_ids[idx] # shape (seq_len,)\n", " attention_mask_example = attention_mask[idx] # shape (seq_len,)\n", " token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)\n", " # forward pass to obtain the logits\n", " outputs = model(input_ids=input_ids_example.unsqueeze(0), \n", " attention_mask=attention_mask_example.unsqueeze(0), \n", " token_type_ids=token_type_ids_example.unsqueeze(0))\n", " logits = outputs.logits\n", " all_logits.append(logits)\n", "\n", " # convert logits to probabilities (which are of shape (1, seq_len))\n", " dist_per_token = torch.distributions.Bernoulli(logits=logits)\n", " probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(dist_per_token.probs.device) \n", "\n", " # Compute average probability per cell, aggregating over tokens.\n", " # Dictionary maps coordinates to a list of one or more probabilities\n", " coords_to_probs = collections.defaultdict(list)\n", " prev_answers = {}\n", " for i, p in enumerate(probabilities.squeeze().tolist()):\n", " segment_id = token_type_ids_example[:,0].tolist()[i]\n", " col = token_type_ids_example[:,1].tolist()[i] - 1\n", " row = token_type_ids_example[:,2].tolist()[i] - 1\n", " if col >= 0 and row >= 0 and segment_id == 1:\n", " coords_to_probs[(col, row)].append(p)\n", "\n", " # Next, map cell coordinates to 1 or 0 (depending on whether the mean prob of all cell tokens is > 0.5)\n", " coords_to_answer = {}\n", " for key in coords_to_probs:\n", " coords_to_answer[key] = np.array(coords_to_probs[key]).mean() > 0.5\n", " prev_answers[idx+1] = coords_to_answer\n", " \n", " logits_batch = torch.cat(tuple(all_logits), 0)\n", " \n", " return logits_batch" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "\n", "queries = [\"What is the abbreviation of the module Information Processing within Organizations?\"]\n", "\n", "table = pd.read_csv(\"MS_IS_all_modules_orginal_15_rows_cleaned.csv\")\n", "table = table.astype(str)\n", "\n", "inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors=\"pt\", Truncation=True)\n", "logits = compute_prediction_sequence(model, inputs, device)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "predicted_answer_coordinates, = tokenizer.convert_logits_to_predictions(inputs, logits.cpu().detach())" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Module titleAbbreviationModule coordinatorModule offered byETCSMethod of gradingDurationModule levelContentsIntended learning outcomesCoursesMethod of assessmentAllocation of placesAdditional informationWorkloadTeaching cycleReferred to in LPO I
0Information Processing within Organizations12-IV-161-m01Faculty of Business Management and Economicsholder of the Chair of Business Management and...5numerical grade1 semestergraduateContent:This course provides students with an ...After completing the course \"Integrated Inform...V (2) + Ü (2)written examination (approx. 60 minutes)Langua...----150 h----
1IT-Management12-M-ITM-161-m01Faculty of Business Management and EconomicsHolder of the Chair of Information Systems Eng...5numerical grade1 semestergraduateContent:This course provides students with an ...After completing the course \"IT Management\", s...V (2) + Ü (2)a) written examination (approx. 60 minutes) or...----150 h----
2Project Seminar12-PS-192-m01Faculty of Business Management and EconomicsHolder of the Chair of Business Management and...15numerical grade1 semestergraduateContent:In small project teams of 4 to 10 memb...After completing the course \"Projektseminar\", ...S (2)project: preparing a conceptual design (approx...----450 h----
3Information Retrieval10-I=IR-161-m01Institute of Computer ScienceDean of Studies Informatik (Computer Science)5numerical grade1 semestergraduateIR models (e. g. Boolean and vector space mode...The students possess theoretical and practical...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
4Analysis and Design of Programs10-I=PA-161-m01Institute of Computer Scienceholder of the Chair of Computer Science II5numerical grade1 semestergraduateProgram analysis, model creation in software e...The students are able to analyse programs, to ...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
5Security of Software Systems10-I=SSS-172-m01Institute of Computer Scienceholder of the Chair of Computer Science II5numerical grade1 semestergraduateThe lecture provides an overview of common sof...Students gain a deep understanding of software...V (2) + Ü (2)Module taught in: Englishwritten examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
6Software Architecture10-I=SAR-161-m01Institute of Computer Scienceholder of the Chair of Computer Science II5numerical grade1 semestergraduateCurrent topics in the area of aerospace.The students possess a fundamental and applica...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
7Artificial Intelligence 110-I=KI1-161-m01Institute of Computer Scienceholder of the Chair of Computer Science VI5numerical grade1 semestergraduateIntelligent agents, uninformed and heuristic s...The students possess theoretical and practical...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
8Discrete Event Simulation10-I=ST-161-m01Institute of Computer Scienceholder of the Chair of Computer Science III8numerical grade1 semestergraduateIntroduction to simulation techniques, statist...The students possess the methodic knowledge an...V (4) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...240 h----
9Advanced Programming10-I=APR-182-m01Institute of Computer Scienceholder of the Chair of Computer Science II5numerical grade1 semestergraduateWith the knowledge of basic programming, taugh...Students learn advanced programming paradigms ...V (2) + Ü (2)Module taught in: Englishwritten examination (90 to 120 minutes)Languag...----150 h----
10Programming with neural nets10-I=PNN-212-m01Institute of Computer Scienceholder of the Chair of Computer Science IX5numerical grade1 semestergraduateOverview over NN, implementation of important ...Knowledge about possible applications and limi...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
11NLP and Text Mining10-I=STM-162-m01Institute of Computer Scienceholder of the Chair of Computer Science VI5numerical grade1 semestergraduateFoundations in the following areas: definition...The students possess theoretical and practical...V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
12Systems Benchmarking10-I=SB-212-m01Institute of Computer Scienceholder of the Chair of Computer Science IX5numerical grade1 semesternan----V (2) + Ü (2)written examination (approx. 60 to 120 minutes...--Focuses available for students of the Masters ...150 h----
13Computer Vision10-xtAI=CV-202-m01Institute of Computer ScienceDean of Studies Informatik (Computer Science)5numerical grade1 semestergraduateThe lecture provides knowledge about current m...Students have fundamental knowledge of problem...V (2) + Ü (2)Module taught in: EnglishWritten examination (approx. 60 to 120 minutes...----150 h----
14Image Processing and Computational Photography10-I=IP-222-m01Institute of Computer Sciencenan5numerical grade1 semesternan----V (2) + Ü (2)Module taught in: Englishwritten examination (approx. 60 to 120 minutes...----150 h----
\n", "
" ], "text/plain": [ " Module title Abbreviation \\\n", "0 Information Processing within Organizations 12-IV-161-m01 \n", "1 IT-Management 12-M-ITM-161-m01 \n", "2 Project Seminar 12-PS-192-m01 \n", "3 Information Retrieval 10-I=IR-161-m01 \n", "4 Analysis and Design of Programs 10-I=PA-161-m01 \n", "5 Security of Software Systems 10-I=SSS-172-m01 \n", "6 Software Architecture 10-I=SAR-161-m01 \n", "7 Artificial Intelligence 1 10-I=KI1-161-m01 \n", "8 Discrete Event Simulation 10-I=ST-161-m01 \n", "9 Advanced Programming 10-I=APR-182-m01 \n", "10 Programming with neural nets 10-I=PNN-212-m01 \n", "11 NLP and Text Mining 10-I=STM-162-m01 \n", "12 Systems Benchmarking 10-I=SB-212-m01 \n", "13 Computer Vision 10-xtAI=CV-202-m01 \n", "14 Image Processing and Computational Photography 10-I=IP-222-m01 \n", "\n", " Module coordinator \\\n", "0 Faculty of Business Management and Economics \n", "1 Faculty of Business Management and Economics \n", "2 Faculty of Business Management and Economics \n", "3 Institute of Computer Science \n", "4 Institute of Computer Science \n", "5 Institute of Computer Science \n", "6 Institute of Computer Science \n", "7 Institute of Computer Science \n", "8 Institute of Computer Science \n", "9 Institute of Computer Science \n", "10 Institute of Computer Science \n", "11 Institute of Computer Science \n", "12 Institute of Computer Science \n", "13 Institute of Computer Science \n", "14 Institute of Computer Science \n", "\n", " Module offered by ETCS Method of grading \\\n", "0 holder of the Chair of Business Management and... 5 numerical grade \n", "1 Holder of the Chair of Information Systems Eng... 5 numerical grade \n", "2 Holder of the Chair of Business Management and... 15 numerical grade \n", "3 Dean of Studies Informatik (Computer Science) 5 numerical grade \n", "4 holder of the Chair of Computer Science II 5 numerical grade \n", "5 holder of the Chair of Computer Science II 5 numerical grade \n", "6 holder of the Chair of Computer Science II 5 numerical grade \n", "7 holder of the Chair of Computer Science VI 5 numerical grade \n", "8 holder of the Chair of Computer Science III 8 numerical grade \n", "9 holder of the Chair of Computer Science II 5 numerical grade \n", "10 holder of the Chair of Computer Science IX 5 numerical grade \n", "11 holder of the Chair of Computer Science VI 5 numerical grade \n", "12 holder of the Chair of Computer Science IX 5 numerical grade \n", "13 Dean of Studies Informatik (Computer Science) 5 numerical grade \n", "14 nan 5 numerical grade \n", "\n", " Duration Module level \\\n", "0 1 semester graduate \n", "1 1 semester graduate \n", "2 1 semester graduate \n", "3 1 semester graduate \n", "4 1 semester graduate \n", "5 1 semester graduate \n", "6 1 semester graduate \n", "7 1 semester graduate \n", "8 1 semester graduate \n", "9 1 semester graduate \n", "10 1 semester graduate \n", "11 1 semester graduate \n", "12 1 semester nan \n", "13 1 semester graduate \n", "14 1 semester nan \n", "\n", " Contents \\\n", "0 Content:This course provides students with an ... \n", "1 Content:This course provides students with an ... \n", "2 Content:In small project teams of 4 to 10 memb... \n", "3 IR models (e. g. Boolean and vector space mode... \n", "4 Program analysis, model creation in software e... \n", "5 The lecture provides an overview of common sof... \n", "6 Current topics in the area of aerospace. \n", "7 Intelligent agents, uninformed and heuristic s... \n", "8 Introduction to simulation techniques, statist... \n", "9 With the knowledge of basic programming, taugh... \n", "10 Overview over NN, implementation of important ... \n", "11 Foundations in the following areas: definition... \n", "12 -- \n", "13 The lecture provides knowledge about current m... \n", "14 -- \n", "\n", " Intended learning outcomes \\\n", "0 After completing the course \"Integrated Inform... \n", "1 After completing the course \"IT Management\", s... \n", "2 After completing the course \"Projektseminar\", ... \n", "3 The students possess theoretical and practical... \n", "4 The students are able to analyse programs, to ... \n", "5 Students gain a deep understanding of software... \n", "6 The students possess a fundamental and applica... \n", "7 The students possess theoretical and practical... \n", "8 The students possess the methodic knowledge an... \n", "9 Students learn advanced programming paradigms ... \n", "10 Knowledge about possible applications and limi... \n", "11 The students possess theoretical and practical... \n", "12 -- \n", "13 Students have fundamental knowledge of problem... \n", "14 -- \n", "\n", " Courses \\\n", "0 V (2) + Ü (2) \n", "1 V (2) + Ü (2) \n", "2 S (2) \n", "3 V (2) + Ü (2) \n", "4 V (2) + Ü (2) \n", "5 V (2) + Ü (2)Module taught in: English \n", "6 V (2) + Ü (2) \n", "7 V (2) + Ü (2) \n", "8 V (4) + Ü (2) \n", "9 V (2) + Ü (2)Module taught in: English \n", "10 V (2) + Ü (2) \n", "11 V (2) + Ü (2) \n", "12 V (2) + Ü (2) \n", "13 V (2) + Ü (2)Module taught in: English \n", "14 V (2) + Ü (2)Module taught in: English \n", "\n", " Method of assessment Allocation of places \\\n", "0 written examination (approx. 60 minutes)Langua... -- \n", "1 a) written examination (approx. 60 minutes) or... -- \n", "2 project: preparing a conceptual design (approx... -- \n", "3 written examination (approx. 60 to 120 minutes... -- \n", "4 written examination (approx. 60 to 120 minutes... -- \n", "5 written examination (approx. 60 to 120 minutes... -- \n", "6 written examination (approx. 60 to 120 minutes... -- \n", "7 written examination (approx. 60 to 120 minutes... -- \n", "8 written examination (approx. 60 to 120 minutes... -- \n", "9 written examination (90 to 120 minutes)Languag... -- \n", "10 written examination (approx. 60 to 120 minutes... -- \n", "11 written examination (approx. 60 to 120 minutes... -- \n", "12 written examination (approx. 60 to 120 minutes... -- \n", "13 Written examination (approx. 60 to 120 minutes... -- \n", "14 written examination (approx. 60 to 120 minutes... -- \n", "\n", " Additional information Workload Teaching cycle \\\n", "0 -- 150 h -- \n", "1 -- 150 h -- \n", "2 -- 450 h -- \n", "3 Focuses available for students of the Masters ... 150 h -- \n", "4 Focuses available for students of the Masters ... 150 h -- \n", "5 Focuses available for students of the Masters ... 150 h -- \n", "6 Focuses available for students of the Masters ... 150 h -- \n", "7 Focuses available for students of the Masters ... 150 h -- \n", "8 Focuses available for students of the Masters ... 240 h -- \n", "9 -- 150 h -- \n", "10 Focuses available for students of the Masters ... 150 h -- \n", "11 Focuses available for students of the Masters ... 150 h -- \n", "12 Focuses available for students of the Masters ... 150 h -- \n", "13 -- 150 h -- \n", "14 -- 150 h -- \n", "\n", " Referred to in LPO I \n", "0 -- \n", "1 -- \n", "2 -- \n", "3 -- \n", "4 -- \n", "5 -- \n", "6 -- \n", "7 -- \n", "8 -- \n", "9 -- \n", "10 -- \n", "11 -- \n", "12 -- \n", "13 -- \n", "14 -- " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "What is the abbreviation of the module Information Processing within Organizations?\n", "Predicted answer: \n" ] } ], "source": [ "# handy helper function in case inference on Pandas dataframe\n", "answers = []\n", "for coordinates in predicted_answer_coordinates:\n", " if len(coordinates) == 1:\n", " # only a single cell:\n", " answers.append(table.iat[coordinates[0]])\n", " else:\n", " # multiple cells\n", " cell_values = []\n", " for coordinate in coordinates:\n", " cell_values.append(table.iat[coordinate])\n", " answers.append(\", \".join(cell_values))\n", "\n", "display(table)\n", "print(\"\")\n", "for query, answer in zip(queries, answers):\n", " print(query)\n", " print(\"Predicted answer: \" + answer)" ] } ], "metadata": { "kernelspec": { "display_name": "py38", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }