"""Turn PDF pages or plain text into cloze-deletion markup.

Every line of the form ``label:value`` is rewritten to
``label: {{cN::value}}`` where ``N`` counts the rewritten lines, starting
at 1.  ``convert`` reads a PDF and writes a ``.txt`` file next to it;
``convert_text`` applies the same rewrite to a string.
"""

import codecs
import os
import re

try:
    import pdfplumber
except ImportError:  # only convert() needs it; convert_text() is pure text work
    pdfplumber = None

# One or more spaces directly before a line break (trailing whitespace).
_TRAILING_SPACES_RE = re.compile(r" +\n")
# A line break followed by indentation and an alphanumeric character: treated
# as a wrapped continuation of the previous line.
_WRAPPED_LINE_RE = re.compile(r"\n +([A-Za-z0-9])")
# First colon on a line that is followed by at least one more character.
_FIELD_RE = re.compile(r"(:)(.+)")

_PAGE_SEPARATOR = "#################### neue Seite ####################\n"


def _add_clozes(text):
    """Join wrapped lines, then wrap the part after each first colon in a cloze.

    The cloze counter starts at 1 for every call and only advances on lines
    that were actually rewritten.  The result is stripped of leading and
    trailing whitespace.
    """
    unwrapped = _WRAPPED_LINE_RE.sub(r" \1", text)

    counter = 1
    rewritten = []
    for line in unwrapped.split("\n"):
        # The doubled braces in the f-string yield the literal ``{{`` / ``}}``.
        line, hits = _FIELD_RE.subn(rf"\1 {{{{c{counter}::\2}}}}", line)
        if hits:
            counter += 1
        rewritten.append(line)
    return "\n".join(rewritten).strip()


def convert(file_path=False):
    """Convert a PDF into cloze text and save it as a ``.txt`` file.

    Args:
        file_path: Path of the PDF to read.  When falsy, a file-open dialog
            is shown so the user can pick one.

    Returns:
        The text that was written to the output file.  Pages are separated
        by a ``neue Seite`` marker line; cloze numbering restarts on every
        page.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError("pdfplumber is required to convert PDF files")

    if not file_path:
        import tkinter as tk
        from tkinter import filedialog

        root = tk.Tk()
        root.withdraw()  # show only the dialog, not an empty main window
        try:
            # NOTE(review): cancelling the dialog yields an empty path, which
            # is passed on to pdfplumber.open unchanged.
            file_path = filedialog.askopenfilename(
                filetypes=[("PDFs", ".pdf")],
                title="Datei zum konvertieren auswählen!",
            )
        finally:
            root.destroy()  # do not leave the hidden root window behind

    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Crop box is (x0, top, x1, bottom): drop a band of 60 units on
            # the left and 80 units at the top of every page.
            cropped = page.crop((60, 80, page.width, page.height))
            # Guard against a missing text result for an empty page.
            text = cropped.extract_text(layout=True) or ""
            pages.append(_add_clozes(_TRAILING_SPACES_RE.sub("\n", text)))

    # Each page is stripped, so the separator needs its own leading newline;
    # otherwise it would be glued to the last line of the previous page.
    conv_string = ("\n" + _PAGE_SEPARATOR).join(pages)

    # Swap only the real extension.  A plain substring replace would leave
    # "X.PDF" or extension-less paths untouched and overwrite the input file.
    out_path = os.path.splitext(file_path)[0] + ".txt"
    with codecs.open(out_path, "w", "utf-8") as text_file:
        text_file.write(conv_string)

    print(f"Alles fertig, die Datei befindet sich unter {out_path}")
    return conv_string


def convert_text(text):
    """Apply the cloze rewrite to a string instead of a PDF.

    Args:
        text: Input text; it is passed through ``str()`` first, and Windows
            line endings are normalised to ``\\n``.

    Returns:
        The rewritten text, stripped of leading and trailing whitespace.
    """
    normalised = str(text).replace("\r\n", "\n")
    return _add_clozes(normalised)


if __name__ == "__main__":
    convert()