+Warwagon MVC Posted January 5, 2024 MVC Share Posted January 5, 2024 Updated the Python PDF name search app. Give it a TXT file with a list of first names. Give it a dictionary full of English words. Give it a PDF to search. It looks through the PDF for a first-name match. Then, it takes the next word following the first name and checks the English dictionary to see if it is an English word. As an example, if it finds "Scarlett drove the car to the property," it would find "drove" in the dictionary and skip it. If it finds "Scarlett Wilberding drove the car to the property," it takes the word "Wilberding" and sees that it's not in the dictionary and then records "Scarlett Wilberding" along with the page number it was found on. Each name is recorded once, and the page numbers on which it appears are listed after the name. I updated the app for the Epstein list, which is supposed to drop today. In the example below, I used one of the public Epstein court documents. You can either copy and paste the code or grab the .py file along with names.txt and dictionary.txt from this Dropbox link: https://www.dropbox.com/scl/fo/06zqmpi5xstjh5w7vawdj/ANmedXX0X4BkNEoHtTQ1c50?rlkey=53892676fmz51ua83elhnzk0a&st=5qrqjsfj&dl=0 import sys from PyQt5.QtWidgets import ( QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QFileDialog, QPushButton, QTextEdit, QLineEdit, QLabel, QProgressBar, QMessageBox, ) from PyQt5.QtCore import Qt, QThread, pyqtSignal import fitz # PyMuPDF import re class SearchThread(QThread): update_progress = pyqtSignal(int) # Signal to update progress bar update_results = pyqtSignal(str, list) # Signal to update results text (name, pages) def __init__(self, pdf_path, names, dictionary, stop_flag): super().__init__() self.pdf_path = pdf_path self.names = names self.dictionary = dictionary self.stop_flag = stop_flag self.results = {} # Stores results as {name: [pages]} def run(self): self.update_results.emit("Startingch...\n", []) pdf_document = 
fitz.open(self.pdf_path) total_pages = pdf_document.page_count for page_number in range(total_pages): if self.stop_flag(): self.update_results.emit("Searchped.\n", []) break page = pdf_document[page_number] text = page.get_text("text") for name in self.names: if re.search(rf'\b{name.lower()}\b', text.lower()): if name.strip(): parts = text.lower().split(name.lower()) if len(parts) > 1: next_word_parts = parts[1].split() if next_word_parts: next_word = next_word_parts[0].rstrip(',\'') if len(re.sub(r'\W', next_word)) in {1, 2}: continue if re.match(r'\w+\.\wnext_word): continue if name.lower() in self.dictionary: continue content_in_parentheses = re.search(r'\((.*?)\)', name) if content_in_parentheses: content_word = content_in_parentheses.group(1).strip() if content_word.lower() in self.dictionary: continue if next_word.isdigit(): continue if self.is_valid_second_word(next_word) and next_word.lower() not in self.dictionary: clean_next_word = self.clean_word(next_word) full_name = f"{name} {clean_next_word}" if full_name not in self.results: self.results[full_name] = [] self.results[full_name].append(page_number + 1) # Emit the updated result self.update_results.emit(full_name, self.results[full_name]) # Update progress progress = int((page_number + 1) / total_pages * 100) self.update_progress.emit(progress) pdf_document.close() self.update_results.emit("Searchleted.\n", []) def is_valid_second_word(self, word): invalid_words = {f"{i}," for i in range(1, 32)} invalid_special_chars = {'#', '&', '-', '?', '.'} cleaned_word = ''.join(char for char in word if char.isalnum() or char in invalid_special_chars) return ( cleaned_word not in invalid_words and cleaned_word not in invalid_special_chars and len(cleaned_word) > 1 ) def clean_word(self, word): cleaned_word = ''.join(char for char in word if char.isalnum() or char in {'-', '_'}) return cleaned_word.strip() class PDFSearchApp(QMainWindow): def __init__(self): super().__init__() self.setWindowTitle("PDF Search App") 
self.setGeometry(100, 100, 600, 700) self.stop_search_flag = False self.found_results = {} # Stores results as {name: [pages]} self.dictionary = set() self.init_ui() def init_ui(self): self.central_widget = QWidget() self.setCentralWidget(self.central_widget) self.layout = QVBoxLayout() # File pickers self.names_file_picker = self.create_file_picker("Names File:") self.pdf_file_picker = self.create_file_picker("PDF File:") self.dictionary_file_picker = self.create_file_picker("Dictionary File:") # Search and Stop buttons self.search_button = QPushButton("Search") self.search_button.clicked.connect(self.on_search) self.stop_button = QPushButton("Stop Search") self.stop_button.clicked.connect(self.on_stop_search) # Results text area self.results_text = QTextEdit() self.results_text.setReadOnly(True) # Search exact text self.search_exact_label = QLabel("Search Exact Text:") self.search_exact_input = QLineEdit() self.search_exact_button = QPushButton("Search Exact") self.search_exact_button.clicked.connect(self.on_search_exact) # Save results button self.save_button = QPushButton("Save Results") self.save_button.clicked.connect(self.on_save_results) # Sort buttons self.sort_first_last_button = QPushButton("Sort First Name Last Name") self.sort_first_last_button.clicked.connect(self.on_sort_first_last) self.sort_last_first_button = QPushButton("Sort Last Name, First Name") self.sort_last_first_button.clicked.connect(self.on_sort_last_first) # Progress bar self.progress_bar = QProgressBar() self.progress_bar.setValue(0) # Add widgets to layout self.layout.addWidget(self.names_file_picker) self.layout.addWidget(self.dictionary_file_picker) self.layout.addWidget(self.pdf_file_picker) self.layout.addWidget(self.search_button) self.layout.addWidget(self.stop_button) self.layout.addWidget(self.results_text) self.layout.addWidget(self.search_exact_label) self.layout.addWidget(self.search_exact_input) self.layout.addWidget(self.search_exact_button) 
self.layout.addWidget(self.save_button) self.layout.addWidget(self.sort_first_last_button) self.layout.addWidget(self.sort_last_first_button) self.layout.addWidget(self.progress_bar) self.central_widget.setLayout(self.layout) def create_file_picker(self, label_text): layout = QHBoxLayout() label = QLabel(label_text) file_picker = QPushButton("Browse...") file_picker.clicked.connect(lambda: self.open_file_dialog(label_text)) layout.addWidget(label) layout.addWidget(file_picker) widget = QWidget() widget.setLayout(layout) return widget def open_file_dialog(self, label_text): file_path, _ = QFileDialog.getOpenFileName(self, f"Select {label_text}", "", "Text Files (*.txt);;PDF Files (*.pdf)") if file_path: if "Names" in label_text: self.names_file_path = file_path elif "PDF" in label_text: self.pdf_file_path = file_path elif "Dictionary" in label_text: self.dictionary_file_path = file_path def on_search(self): self.stop_search_flag = False self.found_results.clear() self.results_text.clear() if not hasattr(self, 'pdf_file_path'): QMessageBox.critical(self, "Error", "Please select a PDF file.") return names = self.load_names(self.names_file_path) self.load_dictionary(self.dictionary_file_path) self.search_thread = SearchThread(self.pdf_file_path, names, self.dictionary, lambda: self.stop_search_flag) self.search_thread.update_progress.connect(self.progress_bar.setValue) self.search_thread.update_results.connect(self.update_results_text) self.search_thread.start() def on_stop_search(self): self.stop_search_flag = True def on_search_exact(self): self.stop_search_flag = False self.found_results.clear() self.results_text.clear() if not hasattr(self, 'pdf_file_path'): QMessageBox.critical(self, "Error", "Please select a PDF file.") return exact_text = self.search_exact_input.text().strip() if not exact_text: QMessageBox.critical(self, "Error", "Please enter exact text to search.") return self.search_thread = SearchThread(self.pdf_file_path, [exact_text], self.dictionary, 
lambda: self.stop_search_flag) self.search_thread.update_progress.connect(self.progress_bar.setValue) self.search_thread.update_results.connect(self.update_results_text) self.search_thread.start() def on_save_results(self): file_path, _ = QFileDialog.getSaveFileName(self, "Save Results", "", "Text Files (*.txt)") if file_path: self.save_results_to_file(file_path) QMessageBox.information(self, "Info", f"Results saved successfully to:\n{file_path}") def on_sort_first_last(self): self.sort_results("first_last") def on_sort_last_first(self): self.sort_results("last_first") def sort_results(self, sort_order): sorted_results = sorted(self.found_results.items(), key=lambda x: self.get_sort_key(x[0], sort_order)) # Clear the results dictionary and update with the sorted names new_results = {} self.results_text.clear() for result_name, result_pages in sorted_results: if sort_order == "last_first": parts = result_name.split() if len(parts) > 1: new_name = f"{parts[-1]} {parts[0]}" # Rewriting the name as "Last First" else: new_name = parts[0] # If there's only one part, keep it unchanged else: new_name = result_name # Keep original order new_results[new_name] = result_pages self.results_text.append(f"{new_name}: {', '.join(map(str, result_pages))}") # Replace found_results with updated names self.found_results = new_results def get_sort_key(self, name, sort_order): parts = name.split() if sort_order == "first_last": return ' '.join(parts) elif sort_order == "last_first": if len(parts) > 1: return f"{parts[-1]}, {' '.join(parts[:-1])}" else: return parts[0] return name def load_names(self, file_path): with open(file_path, 'r', encoding='utf-8') as file: return [name.strip() for name in file] def load_dictionary(self, file_path): with open(file_path, 'r', encoding='utf-8') as file: self.dictionary = {line.strip().split()[0].lower() for line in file if line.strip()} def save_results_to_file(self, file_path): sorted_results = sorted(self.found_results.items(), key=lambda x: 
x[0].lower()) with open(file_path, "w", encoding="utf-8") as file: for name, pages in sorted_results: file.write(f"{name, '.join(map(str, pages))}\n") def update_results_text(self, name, pages): if name in ["Starting search...\n", "Search stopped.\n", "Search completed.\n"]: self.results_text.append(name) return # Ensure last name is capitalized parts = name.split() if len(parts) > 1: formatted_name = f"{parts[0]} {parts[1].capitalize()}" # Capitalize last name else: formatted_name = name.capitalize() # If single name, just capitalize it # Update found_results dictionary if formatted_name not in self.found_results: self.found_results[formatted_name] = [] self.found_results[formatted_name] = pages # Clear and rebuild results text with updated capitalization self.results_text.clear() sorted_results = sorted(self.found_results.items(), key=lambda x: x[0].lower()) for result_name, result_pages in sorted_results: self.results_text.append(f"{result_name}: {', '.join(map(str, result_pages))}") if __name__ == '__main__': app = QApplication(sys.argv) window = PDFSearchApp() window.show() sys.exit(app.exec_())import sys from PyQt5.QtWidgets import ( QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QFileDialog, QPushButton, QTextEdit, QLineEdit, QLabel, QProgressBar, QMessageBox, ) from PyQt5.QtCore import Qt, QThread, pyqtSignal import fitz # PyMuPDF import re class SearchThread(QThread): update_progress = pyqtSignal(int) # Signal to update progress bar update_results = pyqtSignal(str, list) # Signal to update results text (name, pages) def __init__(self, pdf_path, names, dictionary, stop_flag): super().__init__() self.pdf_path = pdf_path self.names = names self.dictionary = dictionary self.stop_flag = stop_flag self.results = {} # Stores results as {name: [pages]} def run(self): self.update_results.emit("Startingch...\n", []) pdf_document = fitz.open(self.pdf_path) total_pages = pdf_document.page_count for page_number in range(total_pages): if self.stop_flag(): 
self.update_results.emit("Searchped.\n", []) break page = pdf_document[page_number] text = page.get_text("text") for name in self.names: if re.search(rf'\b{name.lower()}\b', text.lower()): if name.strip(): parts = text.lower().split(name.lower()) if len(parts) > 1: next_word_parts = parts[1].split() if next_word_parts: next_word = next_word_parts[0].rstrip(',\'') if len(re.sub(r'\W', next_word)) in {1, 2}: continue if re.match(r'\w+\.\wnext_word): continue if name.lower() in self.dictionary: continue content_in_parentheses = re.search(r'\((.*?)\)', name) if content_in_parentheses: content_word = content_in_parentheses.group(1).strip() if content_word.lower() in self.dictionary: continue if next_word.isdigit(): continue if self.is_valid_second_word(next_word) and next_word.lower() not in self.dictionary: clean_next_word = self.clean_word(next_word) full_name = f"{name} {clean_next_word}" if full_name not in self.results: self.results[full_name] = [] self.results[full_name].append(page_number + 1) # Emit the updated result self.update_results.emit(full_name, self.results[full_name]) # Update progress progress = int((page_number + 1) / total_pages * 100) self.update_progress.emit(progress) pdf_document.close() self.update_results.emit("Searchleted.\n", []) def is_valid_second_word(self, word): invalid_words = {f"{i}," for i in range(1, 32)} invalid_special_chars = {'#', '&', '-', '?', '.'} cleaned_word = ''.join(char for char in word if char.isalnum() or char in invalid_special_chars) return ( cleaned_word not in invalid_words and cleaned_word not in invalid_special_chars and len(cleaned_word) > 1 ) def clean_word(self, word): cleaned_word = ''.join(char for char in word if char.isalnum() or char in {'-', '_'}) return cleaned_word.strip() class PDFSearchApp(QMainWindow): def __init__(self): super().__init__() self.setWindowTitle("PDF Search App") self.setGeometry(100, 100, 600, 700) self.stop_search_flag = False self.found_results = {} # Stores results as {name: 
[pages]} self.dictionary = set() self.init_ui() def init_ui(self): self.central_widget = QWidget() self.setCentralWidget(self.central_widget) self.layout = QVBoxLayout() # File pickers self.names_file_picker = self.create_file_picker("Names File:") self.pdf_file_picker = self.create_file_picker("PDF File:") self.dictionary_file_picker = self.create_file_picker("Dictionary File:") # Search and Stop buttons self.search_button = QPushButton("Search") self.search_button.clicked.connect(self.on_search) self.stop_button = QPushButton("Stop Search") self.stop_button.clicked.connect(self.on_stop_search) # Results text area self.results_text = QTextEdit() self.results_text.setReadOnly(True) # Search exact text self.search_exact_label = QLabel("Search Exact Text:") self.search_exact_input = QLineEdit() self.search_exact_button = QPushButton("Search Exact") self.search_exact_button.clicked.connect(self.on_search_exact) # Save results button self.save_button = QPushButton("Save Results") self.save_button.clicked.connect(self.on_save_results) # Sort buttons self.sort_first_last_button = QPushButton("Sort First Name Last Name") self.sort_first_last_button.clicked.connect(self.on_sort_first_last) self.sort_last_first_button = QPushButton("Sort Last Name, First Name") self.sort_last_first_button.clicked.connect(self.on_sort_last_first) # Progress bar self.progress_bar = QProgressBar() self.progress_bar.setValue(0) # Add widgets to layout self.layout.addWidget(self.names_file_picker) self.layout.addWidget(self.pdf_file_picker) self.layout.addWidget(self.dictionary_file_picker) self.layout.addWidget(self.search_button) self.layout.addWidget(self.stop_button) self.layout.addWidget(self.results_text) self.layout.addWidget(self.search_exact_label) self.layout.addWidget(self.search_exact_input) self.layout.addWidget(self.search_exact_button) self.layout.addWidget(self.save_button) self.layout.addWidget(self.sort_first_last_button) self.layout.addWidget(self.sort_last_first_button) 
self.layout.addWidget(self.progress_bar) self.central_widget.setLayout(self.layout) def create_file_picker(self, label_text): layout = QHBoxLayout() label = QLabel(label_text) file_picker = QPushButton("Browse...") file_picker.clicked.connect(lambda: self.open_file_dialog(label_text)) layout.addWidget(label) layout.addWidget(file_picker) widget = QWidget() widget.setLayout(layout) return widget def open_file_dialog(self, label_text): file_path, _ = QFileDialog.getOpenFileName(self, f"Select {label_text}", "", "Text Files (*.txt);;PDF Files (*.pdf)") if file_path: if "Names" in label_text: self.names_file_path = file_path elif "PDF" in label_text: self.pdf_file_path = file_path elif "Dictionary" in label_text: self.dictionary_file_path = file_path def on_search(self): self.stop_search_flag = False self.found_results.clear() self.results_text.clear() if not hasattr(self, 'pdf_file_path'): QMessageBox.critical(self, "Error", "Please select a PDF file.") return names = self.load_names(self.names_file_path) self.load_dictionary(self.dictionary_file_path) self.search_thread = SearchThread(self.pdf_file_path, names, self.dictionary, lambda: self.stop_search_flag) self.search_thread.update_progress.connect(self.progress_bar.setValue) self.search_thread.update_results.connect(self.update_results_text) self.search_thread.start() def on_stop_search(self): self.stop_search_flag = True def on_search_exact(self): self.stop_search_flag = False self.found_results.clear() self.results_text.clear() if not hasattr(self, 'pdf_file_path'): QMessageBox.critical(self, "Error", "Please select a PDF file.") return exact_text = self.search_exact_input.text().strip() if not exact_text: QMessageBox.critical(self, "Error", "Please enter exact text to search.") return self.search_thread = SearchThread(self.pdf_file_path, [exact_text], self.dictionary, lambda: self.stop_search_flag) self.search_thread.update_progress.connect(self.progress_bar.setValue) 
self.search_thread.update_results.connect(self.update_results_text) self.search_thread.start() def on_save_results(self): file_path, _ = QFileDialog.getSaveFileName(self, "Save Results", "", "Text Files (*.txt)") if file_path: self.save_results_to_file(file_path) QMessageBox.information(self, "Info", f"Results saved successfully to:\n{file_path}") def on_sort_first_last(self): self.sort_results("first_last") def on_sort_last_first(self): self.sort_results("last_first") def sort_results(self, sort_order): sorted_results = sorted(self.found_results.items(), key=lambda x: self.get_sort_key(x[0], sort_order)) # Clear the results dictionary and update with the sorted names new_results = {} self.results_text.clear() for result_name, result_pages in sorted_results: if sort_order == "last_first": parts = result_name.split() if len(parts) > 1: new_name = f"{parts[-1]} {parts[0]}" # Rewriting the name as "Last First" else: new_name = parts[0] # If there's only one part, keep it unchanged else: new_name = result_name # Keep original order new_results[new_name] = result_pages self.results_text.append(f"{new_name}: {', '.join(map(str, result_pages))}") # Replace found_results with updated names self.found_results = new_results def get_sort_key(self, name, sort_order): parts = name.split() if sort_order == "first_last": return ' '.join(parts) elif sort_order == "last_first": if len(parts) > 1: return f"{parts[-1]}, {' '.join(parts[:-1])}" else: return parts[0] return name def load_names(self, file_path): with open(file_path, 'r', encoding='utf-8') as file: return [name.strip() for name in file] def load_dictionary(self, file_path): with open(file_path, 'r', encoding='utf-8') as file: self.dictionary = {line.strip().split()[0].lower() for line in file if line.strip()} def save_results_to_file(self, file_path): sorted_results = sorted(self.found_results.items(), key=lambda x: x[0].lower()) with open(file_path, "w", encoding="utf-8") as file: for name, pages in sorted_results: 
file.write(f"{name, '.join(map(str, pages))}\n") def update_results_text(self, name, pages): if name in ["Starting search...\n", "Search stopped.\n", "Search completed.\n"]: self.results_text.append(name) return # Ensure last name is capitalized parts = name.split() if len(parts) > 1: formatted_name = f"{parts[0]} {parts[1].capitalize()}" # Capitalize last name else: formatted_name = name.capitalize() # If single name, just capitalize it # Update found_results dictionary if formatted_name not in self.found_results: self.found_results[formatted_name] = [] self.found_results[formatted_name] = pages # Clear and rebuild results text with updated capitalization self.results_text.clear() sorted_results = sorted(self.found_results.items(), key=lambda x: x[0].lower()) for result_name, result_pages in sorted_results: self.results_text.append(f"{result_name}: {', '.join(map(str, result_pages))}") if __name__ == '__main__': app = QApplication(sys.argv) window = PDFSearchApp() window.show() sys.exit(app.exec_()) goretsky, +Zlip792, xrobwx71 and 1 other 3 1 Share Link to comment https://www.neowin.net/forum/topic/1437118-pdf-name-search-python-app/ Share on other sites More sharing options...
+Warwagon MVC Posted February 27 Author MVC Share Posted February 27 Updated. hellowalkman and xrobwx71 2 Share Link to comment https://www.neowin.net/forum/topic/1437118-pdf-name-search-python-app/#findComment-598969888 Share on other sites More sharing options...
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new account | Sign in
Already have an account? Sign in here.
Sign In Now