Yiming Qian committed on
Commit
f8c6dd9
1 Parent(s): 8cb44d1

Upload pdf_parser.py

Browse files
Files changed (1) hide show
  1. pdf_parser.py +117 -0
pdf_parser.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
import pickle
from pathlib import Path

import pymupdf
import torch
from bs4 import BeautifulSoup
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
+
8
+
9
# Local checkpoint directory used for both the model and its tokenizer.
MODEL_PATH = "./model_4bit"

# Keyword arguments forwarded to ``AutoModelForCausalLM.from_pretrained``.
model_kwargs = dict(
    # NOTE(review): the KV cache is disabled in the model config here; confirm
    # this is intended for an inference-only script.
    use_cache=False,
    # NOTE(review): trust_remote_code lets the checkpoint supply its own model
    # code; only use with a checkpoint you trust.
    trust_remote_code=True,
    attn_implementation="flash_attention_2",  # load the model with flash-attention support
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **model_kwargs)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.model_max_length = 8000
# The EOS token doubles as the padding token. (The previous comment claimed the
# UNK token was used, which did not match the code.)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'
25
+
26
+
27
# System prompt sent with every chunk: it tells the model to turn the XHTML
# markup extracted from the PDF into Markdown. The text is part of the
# program's behavior, so its wording is kept exactly as authored.
SYSTEM = '''The user's input is data in XML format. Please organize it into a markdown format. Pay attention to:

1. Directly output the results. Do not make summary of the text.
2. Do not alter any text from the XML. Do not change number into words.
3. Correct format errors, such as misalignment between numbers and text, and disorder in the sequence of table cells.
4. Use markdown, but all numbers must be explicitly written out in full (e.g., 3.2.5.1).
5. Preserve the original document structure as much as possible, such as paragraphs, lists, etc.
6. Pay attention to detecting tables in the text (as the table format may have been lost due to copying from the XML). Restore the table's format and maintain its integrity. Some tables may be too long and span across pages. Pay attention to merging the same tables that span pages. Properly handle table headers to avoid repetition or omission.
7. Text from the XML may contain some garbled characters; remove any characters that are garbled.
8. Convert headings (H1, H2, H3, etc.) into their respective Markdown heading levels (e.g., 3 for # 3, 3.2 for ## 3.2, 3.2.1 for ### 3.2.1).
9. Include metadata information in the output, such as document title, section number, etc.
10. Remove the footnote and page number, it is important!!!
11. Make sure phrase connected with - will not break up.
'''
41
+
42
def merge_elements_up_to_max_length(elements, max_length):
    """
    Greedily pack consecutive strings into chunks of at most ``max_length`` characters.

    Elements are concatenated in order (no separator). A new chunk is started
    whenever appending the next element would push the current chunk past
    ``max_length``. Elements are never split, so a single input element that is
    already longer than ``max_length`` is returned unchanged as its own chunk.

    Parameters:
    - elements: List[str] - The list of string elements to merge.
    - max_length: int - The maximum length (in characters) a merged chunk may reach.

    Returns:
    - List[str]: A new list of merged chunks; empty if ``elements`` is empty.
    """
    if not elements:
        return []

    merged = []
    # Pieces of the chunk being built, plus their combined length, so each
    # chunk is joined once instead of growing a string with repeated +=.
    current_parts = [elements[0]]
    current_length = len(elements[0])

    for element in elements[1:]:
        if current_length + len(element) <= max_length:
            current_parts.append(element)
            current_length += len(element)
        else:
            merged.append("".join(current_parts))
            current_parts = [element]
            current_length = len(element)

    merged.append("".join(current_parts))
    return merged
67
+
68
+
69
# Text-generation pipeline wrapping the model and tokenizer loaded above.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Keyword arguments passed to every ``pipe(...)`` call.
generation_args = {
    "max_new_tokens": 2000,  # cap on tokens generated per chunk
    "return_full_text": False,  # return only the generated text, not the prompt
    "do_sample": False,  # no sampling
}
80
+
81
# %%
# Convert one PDF to Markdown: collect the paragraph markup of each page,
# pack it into prompt-sized chunks, run every chunk through the model, and
# write the concatenated output next to the PDF.
filename = '2023071000529.pdf'
elements = []
with pymupdf.open(filename) as doc:
    for page in doc:
        soup = BeautifulSoup(page.get_text('xhtml'), 'html.parser')
        # Remove <img> tags so image markup never reaches the prompt.
        for img in soup("img"):
            img.decompose()

        # Keep only <p> elements with at least two characters of text and
        # concatenate their markup into a single string for the page.
        page_parts = [
            str(item)
            for item in soup.find_all('p')
            if len(item.get_text()) >= 2
        ]
        elements.append("".join(page_parts))
        # NOTE(review): indentation was lost in the source; the separator is
        # assumed to be appended once per page - confirm against the original.
        elements.append("<hr>")

# Chunk budget measured with len(), i.e. characters of markup, not tokens.
max_length = 7000

merged_elements = merge_elements_up_to_max_length(elements, max_length)

markdown_parts = []
for item in merged_elements:
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": item},
    ]
    output = pipe(messages, **generation_args)
    markdown_parts.append(output[0]['generated_text'] + '\n')
markdown_text = "".join(markdown_parts)

# Swap the extension instead of slicing off a fixed four characters, and write
# UTF-8 explicitly so the result does not depend on the platform's default
# encoding.
main_file = str(Path(filename).with_suffix('.md'))
with open(main_file, "w", encoding="utf-8") as f:
    f.write(markdown_text)
116
+
117
+