In [None]:
# Necessary pip installs: 
# pip install pandas
# pip install pdfminer.six
# pip install xlsxwriter

In [16]:
# Paths to the module-catalogue PDFs.
# Naming scheme: <degree>_<subject>_all_modules, where MS = Master's and
# BA = Bachelor's; IS = Information Systems, MM = the management catalogues.

# Single place to change if the PDFs are moved; every path below derives from it.
MODULE_CATALOGUE_DIR = "./module_catalogues"

# Masters
MS_IS_all_modules = f"{MODULE_CATALOGUE_DIR}/MS_IS_all_modules.pdf"
MS_MM_all_modules = f"{MODULE_CATALOGUE_DIR}/MS_MM_all_modules.pdf"

# Bachelors
BA_IS_all_modules = f"{MODULE_CATALOGUE_DIR}/BA_IS_all_modules.pdf"
BA_MM_all_modules = f"{MODULE_CATALOGUE_DIR}/BA_MM_all_modules.pdf"


In [11]:
import re
from pdfminer.high_level import extract_text
import pandas as pd

# Extract the text of the Bachelor "Business Management and Economics" (BA_MM)
# catalogue PDF; the result is what the regex-based cells below operate on.
# To process another catalogue, swap the path constant here AND use the matching
# removal_patterns_* list when cleaning the modules.
text_Module_Catalogue = extract_text(BA_MM_all_modules)


In [48]:
# Boilerplate patterns for the Master Information Systems (MS_IS) catalogue.
# The list is handed to clean_entries (helper_methods, not visible here), which
# decides how the patterns are applied (flags, per line vs. per module).
removal_patterns_MS_IS = [
 # Title block: "Module Catalogue for the Subject / Information Systems / Master’s with 1 major, 120 ECTS credits".
 r"Module Catalogue for the Subject\nInformation Systems\nMaster’s with 1 major, 120 ECTS credits",
 # "JMU Würzburg • generated <d>-<Month>-<yyyy> • exam. reg. data record Master (120 ECTS) Information Systems - <yyyy>";
 # the pattern expects "data" hyphenated across a line break ("da-\nta").
 r"JMU\sWürzburg\s•\sgenerated\s\d{1,2}-[A-Za-z]+-\d{4}\s•\sexam\.\sreg\.\sda-\nta\srecord\sMaster\s\(120\sECTS\)\sInformation\sSystems\s-\s\d{4}",
 # "Master’s with 1 major Information Systems (<yyyy>)" line.
 r"Master’s\s+with\s+1\s+major\s+Information\s+Systems\s+\(\d{4}\)",
 # Page counter of the form "page N / M".
 r"page\s+\d+\s+/\s+\d+",
 # Empty or whitespace-only text; presumably meant to drop blank lines -- TODO confirm the flags used in clean_entries.
 r'^\s*$'
 ]

# Boilerplate patterns for the Bachelor Business Information Systems (BA_IS) catalogue.
# The list is handed to clean_entries (helper_methods, not visible here), which
# decides how the patterns are applied (flags, per line vs. per module).

removal_patterns_BA_IS = [
 # Title block: "Module Catalogue for the Subject / Business Information Systems / Bachelor’s with 1 major, 180 ECTS credits".
 r"Module Catalogue for the Subject\nBusiness Information Systems\nBachelor’s with 1 major, 180 ECTS credits",
 # "JMU Würzburg • generated <d>-<Month>-<yyyy> • exam. reg. data record Bachelor (180 ECTS) Wirtschaftsinformatik - <yyyy>";
 # the pattern expects "data" hyphenated across a line break ("da-\nta"). Note the German subject name in this line.
 r"JMU\sWürzburg\s•\sgenerated\s\d{1,2}-[A-Za-z]+-\d{4}\s•\sexam\.\sreg\.\sda-\nta\srecord\sBachelor\s\(180\sECTS\)\sWirtschaftsinformatik\s-\s\d{4}",
 # "Bachelor’s with 1 major Business Information Systems (<yyyy>)" line; "Systems" is expected
 # hyphenated across one or more line breaks ("Sy-\n+stems").
 r"Bachelor’s\s+with\s+1\s+major\s+Business\s+Information\s+Sy-\n+stems\s+\(\d{4}\)",
 # Page counter of the form "page N / M".
 r"page\s+\d+\s+/\s+\d+",
 # Empty or whitespace-only text; presumably meant to drop blank lines -- TODO confirm the flags used in clean_entries.
 r'^\s*$'
 ]

# Boilerplate patterns for the Master Management (MS_MM) catalogue.
# The list is handed to clean_entries (helper_methods, not visible here), which
# decides how the patterns are applied (flags, per line vs. per module).
removal_patterns_MS_MM = [
    # Title block: "Module Catalogue for the Subject / Management / Master’s with 1 major, 120 ECTS credits".
    r"Module Catalogue for the Subject\nManagement\nMaster’s with 1 major, 120 ECTS credits",
    # "JMU Würzburg • generated <d>-<Month>-<yyyy> • exam. reg. ⏎data record Master (120 ECTS) Management - <yyyy>".
    # The generation date and regulation year are matched generically (as in the
    # other removal_patterns_* lists) so a re-generated PDF is still cleaned;
    # the previously hard-coded "11-Mai-2023" / "2018" remain covered.
    r"JMU\sWürzburg\s•\sgenerated\s\d{1,2}-[A-Za-z]+-\d{4}\s•\sexam\.\sreg\.\s\ndata\srecord\sMaster\s\(120\sECTS\)\sManagement\s-\s\d{4}",
    # "Master’s with 1 major Management (<yyyy>)" line.
    r"Master’s\s+with\s+1\s+major\s+Management\s+\(\d{4}\)",
    # Page counter of the form "page N / M".
    r"page\s+\d+\s+/\s+\d+",
    # Empty or whitespace-only text; presumably meant to drop blank lines -- TODO confirm the flags used in clean_entries.
    r'^\s*$',
]

# Boilerplate patterns for the Bachelor Business Management and Economics (BA_MM) catalogue.
# The list is handed to clean_entries (helper_methods, not visible here), which
# decides how the patterns are applied (flags, per line vs. per module).
removal_patterns_BA_MM = [
 # Title block: "Module Catalogue for the Subject / Business Management and Economics / Bachelor’s with 1 major, 180 ECTS credits".
 r"Module Catalogue for the Subject\nBusiness Management and Economics\nBachelor’s with 1 major, 180 ECTS credits",
 # "JMU Würzburg • generated <dd>-<Mon>-<yyyy> • exam. reg. data re-" ... "Bachelor (180 ECTS) Wirtschaftswissenschaft - 2008".
 # Unlike the sibling lists this one uses literal spaces, requires a two-digit day and a
 # three-letter ASCII month, lazily skips anything ([\s\S]*?) after the hyphenated "re-",
 # and hard-codes the year 2008 -- a catalogue with another regulation year will not match.
 r"JMU Würzburg • generated \d{2}-[A-Za-z]{3}-\d{4} • exam\. reg\. data re-[\s\S]*?Bachelor \(180 ECTS\) Wirtschaftswissenschaft - 2008",
 # "Bachelor’s with 1 major Business Management and Economics (<yyyy>)" line.
 r"Bachelor’s\s+with\s+1\s+major\s+Business\s+Management\s+and\s+Economics\s+\(\d{4}\)",
 # Page counter of the form "page N / M".
 r"page\s+\d+\s+/\s+\d+",
 # Empty or whitespace-only text; presumably meant to drop blank lines -- TODO confirm the flags used in clean_entries.
 r'^\s*$'
 ]


In [50]:
# regex patterns to get attributes of Master Information Systems
from enum import Enum

class Patterns_MS_IS(Enum):
 """Regular expressions used to pull module attributes out of catalogue text.

 Each member's ``.value`` is a raw regex string. In this notebook the
 patterns are passed to ``re.findall`` (PATTERN_ENTIRE_MODULE) or to the
 helpers ``extract_first_match`` / ``extract_LineMatch`` from
 ``helper_methods``; how those helpers apply a pattern (flags, per-line
 iteration, which group is returned) is not visible here.

 Despite the ``MS_IS`` suffix, the notebook also applies these patterns to
 the other catalogues.
 """

 # One whole module: lazily from "Module title" up to (not including) the next
 # "Module title" or the end of the text (no flags are passed to re.findall).
 PATTERN_ENTIRE_MODULE = r"Module title[\s\S]*?(?=Module title|$)"
 # Group 1: rest of the first non-blank line after the "Module title" label.
 MODULE_TITLE = r'Module title\s*\n*\s*(.*)'
 # Group 1: rest of the first non-blank line after the "Abbreviation" label.
 ABBREVIATION = r'Abbreviation\s*\n*\s*(.*)'

 # The patterns below are anchored with ^ / $ and carry no label, i.e. they
 # recognise a value by its shape. Presumably extract_LineMatch tests them
 # against individual lines -- TODO confirm in helper_methods.
 # Text starting with "Faculty" or "Institute".
 MODULE_OFFERED_BY = r"^(Faculty|Institute).*"
 # Text starting with "Holder", "holder" or "Dean".
 MODULE_COORDINATOR = r"^(Holder|holder|Dean).*"
 # Exactly one or two digits. NOTE: the member name is a misspelling of "ECTS";
 # it is kept because other cells reference it by this name.
 ETCS = r"^\d{1,2}$"
 # Two alternatives: "...(not )successfully completed" OR "numerical grade...".
 # NOTE(review): the leading ".*" belongs only to the first alternative and the
 # trailing ".*" only to the second; the greedy ".*" also makes "(not\s)?" redundant.
 METHOD_GRADING = r".*(not\s)?successfully completed|numerical grade.*"
 # A single digit, one whitespace character, then "semester".
 DURATION = r"^\d\ssemester$"
 # Exactly "graduate" or "undergraduate".
 MODULE_LEVEL = r"^(?:graduate|undergraduate)$"

 # The patterns below capture (group 1, lazily, across newlines) the text
 # between two fixed section labels.
 # Between "Contents" and "Intended learning outcomes".
 CONTENTS = r'Contents([\s\S]*?)Intended learning outcomes'
 # Between "Intended learning outcomes" + blank line and blank line + "Courses (type".
 INTENDED_LEARNING_OUTCOMES = r'Intended learning outcomes\n\n([\s\S]*?)\n\nCourses \(type'
 # Between "if other than German)" and "Method of assessment".
 COURSES = r'if other than German\)([\s\S]*?)Method of assessment'
 # Between "whether <newline> module is creditable for bonus)" and "Allocation of places".
 ASSESSMENT = r'whether\s*\nmodule is creditable for bonus\)([\s\S]*?)Allocation of places'
 # Between "Allocation of places" and "Additional information".
 ALLOCATION = r'Allocation of places([\s\S]*?)Additional information'
 # Between "Additional information" and "Workload".
 ADDITIONAL_INFORMATION = r'Additional information([\s\S]*?)Workload'
 # Between "Workload" and "Teaching cycle".
 WORKLOAD = r'Workload([\s\S]*?)Teaching cycle'
 # Between "Teaching cycle" and "Referred to in LPO I".
 TEACHING_CYCLE = r'Teaching cycle([\s\S]*?)Referred to in LPO I'
 # Between "regulations for teaching-degree programmes)" and "Module appears in".
 REFERRED_LPO = r'regulations for teaching-degree programmes\)([\s\S]*?)Module appears in'


In [51]:
from helper_methods import extract_first_match, extract_LineMatch, clean_entries
import xlsxwriter

# Extract modules to xlsx -> Method shall be used generically for all modules later

def extract_modules_to_xlsx(text, patternsToRemove, file_path):
    """Split catalogue text into modules and write one row per module to an xlsx file.

    The text is cut into per-module chunks with
    ``Patterns_MS_IS.PATTERN_ENTIRE_MODULE``, the chunks are passed through
    ``clean_entries`` together with ``patternsToRemove``, and every attribute
    pattern of ``Patterns_MS_IS`` is then applied to each chunk. Row 0 of the
    first worksheet holds the column headers; module ``k`` (0-based) is
    written to row ``k + 1``.

    Args:
        text: Full catalogue text (e.g. the result of ``extract_text``).
        patternsToRemove: Regex strings forwarded to ``clean_entries``; use the
            ``removal_patterns_*`` list that belongs to the catalogue in ``text``.
        file_path: Destination of the xlsx workbook; an existing file is overwritten.

    Returns:
        None. The workbook at ``file_path`` is the only result.
    """
    modules = re.findall(Patterns_MS_IS.PATTERN_ENTIRE_MODULE.value, text)
    modules = clean_entries(modules, patternsToRemove)

    # (column header, extractor, pattern) -- one entry per output column.
    # Keeping header and extractor side by side guarantees they cannot drift
    # apart; previously the values for 'Module coordinator' and
    # 'Module offered by' were written under each other's header.
    # NOTE(review): the header 'ETCS' is a misspelling of 'ECTS'; it is left
    # unchanged in case downstream code selects the column by this name.
    columns = [
        ('Module title', extract_first_match, Patterns_MS_IS.MODULE_TITLE),
        ('Abbreviation', extract_first_match, Patterns_MS_IS.ABBREVIATION),
        ('Module coordinator', extract_LineMatch, Patterns_MS_IS.MODULE_COORDINATOR),
        ('Module offered by', extract_LineMatch, Patterns_MS_IS.MODULE_OFFERED_BY),
        ('ETCS', extract_LineMatch, Patterns_MS_IS.ETCS),
        ('Method of grading', extract_LineMatch, Patterns_MS_IS.METHOD_GRADING),
        ('Duration', extract_LineMatch, Patterns_MS_IS.DURATION),
        ('Module level', extract_LineMatch, Patterns_MS_IS.MODULE_LEVEL),
        ('Contents', extract_first_match, Patterns_MS_IS.CONTENTS),
        ('Intended learning outcomes', extract_first_match, Patterns_MS_IS.INTENDED_LEARNING_OUTCOMES),
        ('Courses', extract_first_match, Patterns_MS_IS.COURSES),
        ('Method of assessment', extract_first_match, Patterns_MS_IS.ASSESSMENT),
        ('Allocation of places', extract_first_match, Patterns_MS_IS.ALLOCATION),
        ('Additional information', extract_first_match, Patterns_MS_IS.ADDITIONAL_INFORMATION),
        ('Workload', extract_first_match, Patterns_MS_IS.WORKLOAD),
        ('Teaching cycle', extract_first_match, Patterns_MS_IS.TEACHING_CYCLE),
        ('Referred to in LPO I', extract_first_match, Patterns_MS_IS.REFERRED_LPO),
    ]

    # The context manager closes (and thereby saves) the workbook even if an
    # extractor raises part-way through.
    with xlsxwriter.Workbook(file_path) as workbook:
        worksheet = workbook.add_worksheet()

        # Header row.
        for col, (header, _extract, _pattern) in enumerate(columns):
            worksheet.write(0, col, header)

        # One row per module, starting below the header.
        for row, module in enumerate(modules, start=1):
            for col, (_header, extract, pattern) in enumerate(columns):
                worksheet.write(row, col, extract(module, pattern.value))




In [52]:
# Export the Bachelor Business Management and Economics (BA_MM) catalogue that
# was read into text_Module_Catalogue. The removal patterns have to belong to
# the same catalogue as the text, otherwise its headers/footers are not stripped.

extract_modules_to_xlsx(text_Module_Catalogue, removal_patterns_BA_MM, "BA_MM_all_modules.xlsx")


In [49]:
# Sanity check: split the catalogue text into per-module chunks, strip the
# BA_MM boilerplate and print the first few cleaned modules for inspection.
modules = re.findall(Patterns_MS_IS.PATTERN_ENTIRE_MODULE.value, text_Module_Catalogue)
modules = clean_entries(modules, removal_patterns_BA_MM)

# Slice rather than index so a catalogue with fewer than three modules
# prints what is there instead of raising IndexError.
for module in modules[:3]:
    print(module)

Module title

Introduction to Market-Oriented Management

Abbreviation

12-Mark-G-082-m01

Module coordinator

Module offered by

holder of the Chair of Business Management and Marke-
ting

Faculty of Business Management and Economics

ECTS Method of grading

Only after succ. compl. of module(s)

5

numerical grade

--

Duration

Module level

Other prerequisites

1 semester

undergraduate

--

Contents

Description
In this module, students will acquire the theoretical foundations of market-oriented management.

Content:
With the stakeholder approach as a starting point, the basic design of market-oriented management will be ex-
plained and exemplified in the 5 classical steps: situation analysis, objectives, strategies, tools and control-
ling. The course will focus not only on the behavioural approaches of consumer behaviour but also on industri-
al purchasing behaviour. A case study introducing students to the fundamental principles of market research ba-
sed on a conjoint analysis 