79 lines
2.4 KiB
Python
Executable File
79 lines
2.4 KiB
Python
Executable File
import pdfplumber
|
|
import re
|
|
|
|
import codecs
|
|
|
|
|
|
def convert(file_path=False):
    """Convert a PDF into cloze-annotated text and save it as a ``.txt`` file.

    Each page is cropped to the box ``(60, 80, page.width, page.height)``,
    its text is extracted with ``layout=True`` and every line containing a
    colon followed by text is rewritten as ``<prefix>: {{cN::<rest>}}``.
    The cloze counter ``N`` restarts at 1 on every page.

    Args:
        file_path: Path of the PDF to convert. If falsy (the default), a
            Tk file dialog asks the user to pick a file.

    Returns:
        The converted text of all pages joined by the page separator, or
        ``None`` if the file dialog was cancelled.
    """
    import os

    if not file_path:
        import tkinter as tk
        from tkinter import filedialog

        root = tk.Tk()
        root.withdraw()  # only the dialog should be visible, not the empty root window
        try:
            file_path = filedialog.askopenfilename(
                filetypes=[("PDFs", ".pdf")], title="Datei zum konvertieren auswählen!"
            )
        finally:
            root.destroy()

        # A cancelled dialog yields an empty value; there is nothing to convert.
        if not file_path:
            print("Keine Datei ausgewählt.")
            return None

    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            crop = page.crop((60, 80, page.width, page.height))
            # NOTE(review): some pdfplumber versions may return None for a page
            # without text - fall back to an empty string so re.sub cannot fail.
            text = crop.extract_text(layout=True) or ""
            pages.append(_page_text_to_cloze(text))

    # NOTE(review): every page is stripped, so the separator directly follows
    # the last line of the previous page. Kept as-is to preserve the output format.
    conv_string = "#################### neue Seite ####################\n".join(pages)

    # Swap only the real extension. A plain str.replace(".pdf", ".txt") would
    # leave e.g. "X.PDF" unchanged and the output would overwrite the source PDF.
    out_path = os.path.splitext(file_path)[0] + ".txt"

    # newline="" disables newline translation, so the bytes written match what
    # the previous codecs.open(..., "w", "utf-8") call produced.
    with open(out_path, "w", encoding="utf-8", newline="") as text_file:
        text_file.write(conv_string)

    print(f"Alles fertig, die Datei befindet sich unter {out_path}")

    # Always return the text; the return value is simply ignored by the script entry point.
    return conv_string


def _page_text_to_cloze(text):
    """Clean up one page of extracted text and wrap colon values in cloze markers."""
    # Remove spaces that sit directly in front of a line break.
    text = re.sub(r" +\n", "\n", text)
    # A line break followed by spaces and an alphanumeric character is treated
    # as a wrapped line and joined to the previous line with a single space.
    text = re.sub(r"\n +([A-Za-z0-9])", r" \1", text)

    cloze_index = 1
    result = []
    for line in text.split("\n"):
        # Everything after the first colon becomes the cloze content; the
        # greedy ".+" consumes the rest of the line, so at most one hit per line.
        line, hits = re.subn(r"(:)(.+)", rf"\1 {{{{c{cloze_index}::\2}}}}", line)
        if hits:
            cloze_index += 1
        result.append(line)
    return "\n".join(result).strip()
|
|
|
|
|
|
def convert_text(text):
    """Wrap the value part of every ``key: value`` line in a cloze marker.

    The input is converted to ``str`` and Windows line endings are
    normalised. A line break followed by spaces and an alphanumeric
    character is treated as a wrapped line and joined to the previous line.
    In every resulting line that contains a colon followed by at least one
    character, the text after the first colon is rewritten as
    ``: {{cN::<rest>}}``, where ``N`` counts the rewritten lines from 1.

    Args:
        text: The text to convert; any object is accepted and passed
            through ``str()``.

    Returns:
        The converted text with leading and trailing whitespace stripped.
    """
    # str.replace is a no-op when there is no CRLF, so no membership test is needed.
    text = str(text).replace("\r\n", "\n")

    # Join wrapped lines (raw string: the old "\ " literal was an invalid escape).
    joined = re.sub(r"\n +([A-Za-z0-9])", r" \1", text)

    cloze_index = 1
    changed_lines = []
    for line in joined.split("\n"):
        # The greedy ".+" consumes the rest of the line, so there is at most
        # one substitution per line and the counter advances once per hit line.
        line, hits = re.subn(r"(:)(.+)", rf"\1 {{{{c{cloze_index}::\2}}}}", line)
        if hits:
            cloze_index += 1
        changed_lines.append(line)

    # Return unconditionally: the previous __name__ check made this function
    # return None whenever the module itself was run as a script.
    return "\n".join(changed_lines).strip()
|
|
|
|
|
|
# Script entry point: convert() is called without a path, so it asks for the
# PDF through its file dialog.
if __name__ == "__main__":
    convert()
|