Recommended Posts

Updated the Python PDF name search app. 

  1. Give it a TXT file with a list of first names.
  2. Give it a dictionary file full of English words.
  3. Give it a PDF to search.

It looks through the PDF for a first name match. Then, it takes the next word following the first name and checks the English dictionary to see if it is an English word.

As an example, if it finds "Scarlett drove the car to the property," it would find "drove" in the dictionary and skip it. If it finds "Scarlett Wilberding drove the car to the property," it takes the word "Wilberding" and sees that it's not in the dictionary and then records "Scarlett Wilberding" along with the page number it was found on behind it.

Each name is recorded once, and the page numbers on which it appears are listed after the name.

I updated the app for the Epstein list, which is supposed to drop today.

In the example below, I used one of the public Epstein court documents.

You can either copy and paste the code below, or grab the .py file together with names.txt and dictionary.txt from this Dropbox link:
https://www.dropbox.com/scl/fo/06zqmpi5xstjh5w7vawdj/ANmedXX0X4BkNEoHtTQ1c50?rlkey=53892676fmz51ua83elhnzk0a&st=5qrqjsfj&dl=0

image.thumb.png.506317de17d38d8c1b8aa8dfa96a2fad.png

 

import sys
from PyQt5.QtWidgets import (
    QApplication,
    QMainWindow,
    QWidget,
    QVBoxLayout,
    QHBoxLayout,
    QFileDialog,
    QPushButton,
    QTextEdit,
    QLineEdit,
    QLabel,
    QProgressBar,
    QMessageBox,
)
from PyQt5.QtCore import Qt, QThread, pyqtSignal
import fitz  # PyMuPDF
import re


class SearchThread(QThread):
    """Background worker that scans a PDF for "<name> <unknown word>" pairs.

    On every page each configured name is looked up case-insensitively as a
    whole word. The word following the first such occurrence on the page is
    treated as a surname candidate; it is reported unless it is very short,
    numeric, looks like a dotted token, or is present in the dictionary.

    Signals:
        update_progress(int): percentage of pages processed (0-100).
        update_results(str, list): either a status message with an empty
            list, or a found name with the 1-based page numbers recorded for
            it so far.
    """

    update_progress = pyqtSignal(int)  # Signal to update progress bar
    update_results = pyqtSignal(str, list)  # Signal to update results text (name, pages)

    def __init__(self, pdf_path, names, dictionary, stop_flag):
        """Store the search inputs.

        Args:
            pdf_path: path of the PDF to open with PyMuPDF.
            names: iterable of names to look for.
            dictionary: container of lower-case words; a following word found
                in it is not treated as a surname.
            stop_flag: zero-argument callable; a truthy return value aborts
                the search before the next page is processed.
        """
        super().__init__()
        self.pdf_path = pdf_path
        self.names = names
        self.dictionary = dictionary
        self.stop_flag = stop_flag
        self.results = {}  # Stores results as {name: [pages]}

    def run(self):
        """Scan the PDF page by page, emitting results and progress."""
        self.update_results.emit("Starting search...\n", [])
        candidates = self._searchable_names()
        stopped = False

        pdf_document = fitz.open(self.pdf_path)
        try:
            total_pages = pdf_document.page_count
            for page_number in range(total_pages):
                if self.stop_flag():
                    stopped = True
                    break

                page_text = pdf_document[page_number].get_text("text").lower()
                for name, pattern in candidates:
                    surname = self._surname_after(pattern, page_text)
                    if surname is None:
                        continue

                    full_name = f"{name} {surname}"
                    pages = self.results.setdefault(full_name, [])
                    current_page = page_number + 1
                    # A name listed twice in the input must not record the
                    # same page twice.
                    if not pages or pages[-1] != current_page:
                        pages.append(current_page)
                        # Emit a copy: the receiver lives in another thread
                        # and must not share the list this thread mutates.
                        self.update_results.emit(full_name, list(pages))

                # Update progress
                self.update_progress.emit(int((page_number + 1) / total_pages * 100))
        finally:
            # Release the document even if reading a page raised.
            pdf_document.close()

        final_message = "Search stopped.\n" if stopped else "Search completed.\n"
        self.update_results.emit(final_message, [])

    def _searchable_names(self):
        """Return ``(name, compiled whole-word pattern)`` pairs to search for.

        Blank names, names that are themselves dictionary words, and names
        whose parenthesised part is a dictionary word are left out.
        """
        searchable = []
        for name in self.names:
            if not name.strip():
                continue
            lowered = name.lower()
            if lowered in self.dictionary:
                continue
            bracketed = re.search(r'\((.*?)\)', name)
            if bracketed and bracketed.group(1).strip().lower() in self.dictionary:
                continue
            # Escape the name so characters such as "(" or "." are matched
            # literally instead of being interpreted as regex syntax.
            searchable.append((name, re.compile(rf'\b{re.escape(lowered)}\b')))
        return searchable

    def _surname_after(self, pattern, page_text):
        """Return the cleaned word following the first match, or ``None``.

        Only the first whole-word occurrence of the name on the page is
        examined. ``page_text`` is expected to be lower-cased already.
        """
        match = pattern.search(page_text)
        if match is None:
            return None

        following = page_text[match.end():].split()
        if not following:
            return None

        next_word = following[0].rstrip(",'")
        # One or two word characters: initials, "of", "is", ...
        if len(re.sub(r'\W', '', next_word)) in {1, 2}:
            return None
        # Dotted tokens such as "example.com" or "a.m".
        if re.match(r'\w+\.\w+', next_word):
            return None
        if next_word.isdigit():
            return None
        if not self.is_valid_second_word(next_word):
            return None
        if next_word.lower() in self.dictionary:
            return None

        # A word made only of punctuation cleans down to "", which is not a name.
        return self.clean_word(next_word) or None

    def is_valid_second_word(self, word):
        """Return True when ``word`` could plausibly be a surname.

        The word is reduced to alphanumerics plus ``# & - ? .``; the result
        must be longer than one character and must not be a bare
        day-of-month number (1-31).
        """
        # The comma is stripped by the cleaning below, so compare against the
        # bare numbers rather than "1,", "2,", ...
        invalid_words = {str(day) for day in range(1, 32)}
        invalid_special_chars = {'#', '&', '-', '?', '.'}
        cleaned_word = ''.join(
            char for char in word if char.isalnum() or char in invalid_special_chars
        )
        return (
            cleaned_word not in invalid_words
            and cleaned_word not in invalid_special_chars
            and len(cleaned_word) > 1
        )

    def clean_word(self, word):
        """Return ``word`` with everything but alphanumerics, '-' and '_' removed."""
        cleaned_word = ''.join(char for char in word if char.isalnum() or char in {'-', '_'})
        return cleaned_word.strip()


class PDFSearchApp(QMainWindow):
    """Main window: pick the input files, run a search, then sort or save the results."""

    # Status lines sent by the search thread; shown verbatim instead of being
    # stored as a found name.
    _STATUS_MESSAGES = ("Starting search...\n", "Search stopped.\n", "Search completed.\n")

    def __init__(self):
        """Create the window with empty results and no files selected."""
        super().__init__()
        self.setWindowTitle("PDF Search App")
        self.setGeometry(100, 100, 600, 700)

        self.stop_search_flag = False
        self.found_results = {}  # Stores results as {name: [pages]}
        self.dictionary = set()

        # Paths chosen through the file pickers; None until the user picks one.
        self.names_file_path = None
        self.pdf_file_path = None
        self.dictionary_file_path = None
        self.search_thread = None

        self.init_ui()

    def init_ui(self):
        """Build all widgets and stack them vertically in the central widget."""
        self.central_widget = QWidget()
        self.setCentralWidget(self.central_widget)

        self.layout = QVBoxLayout()

        # File pickers
        self.names_file_picker = self.create_file_picker("Names File:")
        self.pdf_file_picker = self.create_file_picker("PDF File:")
        self.dictionary_file_picker = self.create_file_picker("Dictionary File:")

        # Search and Stop buttons
        self.search_button = QPushButton("Search")
        self.search_button.clicked.connect(self.on_search)
        self.stop_button = QPushButton("Stop Search")
        self.stop_button.clicked.connect(self.on_stop_search)

        # Results text area
        self.results_text = QTextEdit()
        self.results_text.setReadOnly(True)

        # Search exact text
        self.search_exact_label = QLabel("Search Exact Text:")
        self.search_exact_input = QLineEdit()
        self.search_exact_button = QPushButton("Search Exact")
        self.search_exact_button.clicked.connect(self.on_search_exact)

        # Save results button
        self.save_button = QPushButton("Save Results")
        self.save_button.clicked.connect(self.on_save_results)

        # Sort buttons
        self.sort_first_last_button = QPushButton("Sort First Name Last Name")
        self.sort_first_last_button.clicked.connect(self.on_sort_first_last)
        self.sort_last_first_button = QPushButton("Sort Last Name, First Name")
        self.sort_last_first_button.clicked.connect(self.on_sort_last_first)

        # Progress bar
        self.progress_bar = QProgressBar()
        self.progress_bar.setValue(0)

        # Add widgets to layout
        self.layout.addWidget(self.names_file_picker)
        self.layout.addWidget(self.dictionary_file_picker)
        self.layout.addWidget(self.pdf_file_picker)
        self.layout.addWidget(self.search_button)
        self.layout.addWidget(self.stop_button)
        self.layout.addWidget(self.results_text)
        self.layout.addWidget(self.search_exact_label)
        self.layout.addWidget(self.search_exact_input)
        self.layout.addWidget(self.search_exact_button)
        self.layout.addWidget(self.save_button)
        self.layout.addWidget(self.sort_first_last_button)
        self.layout.addWidget(self.sort_last_first_button)
        self.layout.addWidget(self.progress_bar)

        self.central_widget.setLayout(self.layout)

    def create_file_picker(self, label_text):
        """Return a widget holding a label and a "Browse..." button for one input file."""
        layout = QHBoxLayout()
        label = QLabel(label_text)
        file_picker = QPushButton("Browse...")
        file_picker.clicked.connect(lambda: self.open_file_dialog(label_text))
        layout.addWidget(label)
        layout.addWidget(file_picker)
        widget = QWidget()
        widget.setLayout(layout)
        return widget

    def open_file_dialog(self, label_text):
        """Ask for a file and remember it as the names, PDF or dictionary path.

        Which path is set depends on the keyword found in ``label_text``.
        """
        # Offer the matching file type first so the PDF picker does not open
        # with only *.txt files visible.
        if "PDF" in label_text:
            file_filter = "PDF Files (*.pdf);;Text Files (*.txt)"
        else:
            file_filter = "Text Files (*.txt);;PDF Files (*.pdf)"

        file_path, _ = QFileDialog.getOpenFileName(self, f"Select {label_text}", "", file_filter)
        if file_path:
            if "Names" in label_text:
                self.names_file_path = file_path
            elif "PDF" in label_text:
                self.pdf_file_path = file_path
            elif "Dictionary" in label_text:
                self.dictionary_file_path = file_path

    def on_search(self):
        """Start a search for every name in the names file."""
        if self._search_in_progress():
            return

        self.stop_search_flag = False
        self.found_results.clear()
        self.results_text.clear()

        if not self.pdf_file_path:
            QMessageBox.critical(self, "Error", "Please select a PDF file.")
            return
        if not self.names_file_path:
            QMessageBox.critical(self, "Error", "Please select a names file.")
            return
        if not self.dictionary_file_path:
            QMessageBox.critical(self, "Error", "Please select a dictionary file.")
            return

        try:
            names = self.load_names(self.names_file_path)
            self.load_dictionary(self.dictionary_file_path)
        except (OSError, UnicodeDecodeError) as error:
            QMessageBox.critical(self, "Error", f"Could not read input file:\n{error}")
            return

        self._start_search(names)

    def on_stop_search(self):
        """Ask the running search to stop before its next page."""
        self.stop_search_flag = True

    def on_search_exact(self):
        """Start a search for the single text typed into the exact-search box.

        Uses whatever dictionary is currently loaded.
        """
        if self._search_in_progress():
            return

        self.stop_search_flag = False
        self.found_results.clear()
        self.results_text.clear()

        if not self.pdf_file_path:
            QMessageBox.critical(self, "Error", "Please select a PDF file.")
            return

        exact_text = self.search_exact_input.text().strip()
        if not exact_text:
            QMessageBox.critical(self, "Error", "Please enter exact text to search.")
            return

        self._start_search([exact_text])

    def _search_in_progress(self):
        """Warn and return True if a previous search thread is still running.

        Replacing the reference to a running QThread would let it be
        destroyed mid-run, so a new search is refused instead.
        """
        if self.search_thread is not None and self.search_thread.isRunning():
            QMessageBox.warning(
                self,
                "Search Running",
                "A search is already running. Stop it or wait for it to finish.",
            )
            return True
        return False

    def _start_search(self, names):
        """Create, wire up and start a SearchThread for ``names``."""
        self.search_thread = SearchThread(
            self.pdf_file_path, names, self.dictionary, lambda: self.stop_search_flag
        )
        self.search_thread.update_progress.connect(self.progress_bar.setValue)
        self.search_thread.update_results.connect(self.update_results_text)
        self.search_thread.start()

    def on_save_results(self):
        """Ask for a destination and write the current results to it."""
        file_path, _ = QFileDialog.getSaveFileName(self, "Save Results", "", "Text Files (*.txt)")
        if file_path:
            self.save_results_to_file(file_path)
            QMessageBox.information(self, "Info", f"Results saved successfully to:\n{file_path}")

    def on_sort_first_last(self):
        """Re-display the results sorted by the name as stored."""
        self.sort_results("first_last")

    def on_sort_last_first(self):
        """Re-display the results sorted by, and rewritten to start with, the last word."""
        self.sort_results("last_first")

    def sort_results(self, sort_order):
        """Sort the results, redraw the text area and store the new order.

        For ``"last_first"`` each multi-word name is rewritten as
        "<last word> <first word>" and the rewritten names replace the keys
        of ``found_results``, so a later save uses them too.
        """
        sorted_results = sorted(
            self.found_results.items(),
            key=lambda x: self.get_sort_key(x[0], sort_order),
        )

        # Clear the results dictionary and update with the sorted names
        new_results = {}
        self.results_text.clear()

        for result_name, result_pages in sorted_results:
            if sort_order == "last_first":
                parts = result_name.split()
                if len(parts) > 1:
                    new_name = f"{parts[-1]} {parts[0]}"  # Rewriting the name as "Last First"
                else:
                    # Zero or one word: nothing to swap.
                    new_name = parts[0] if parts else result_name
            else:
                new_name = result_name  # Keep original order

            new_results[new_name] = result_pages
            self.results_text.append(f"{new_name}: {', '.join(map(str, result_pages))}")

        # Replace found_results with updated names
        self.found_results = new_results

    def get_sort_key(self, name, sort_order):
        """Return the string ``name`` is ordered by under ``sort_order``."""
        parts = name.split()
        if not parts:
            # Blank name: there is no first or last word to build a key from.
            return name
        if sort_order == "first_last":
            return ' '.join(parts)
        elif sort_order == "last_first":
            if len(parts) > 1:
                return f"{parts[-1]}, {' '.join(parts[:-1])}"
            else:
                return parts[0]
        return name

    def load_names(self, file_path):
        """Return the non-blank lines of ``file_path``, stripped of whitespace."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file if line.strip()]

    def load_dictionary(self, file_path):
        """Load the lower-cased first token of every non-blank line into ``self.dictionary``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            self.dictionary = {line.strip().split()[0].lower() for line in file if line.strip()}

    def save_results_to_file(self, file_path):
        """Write one "name: page, page, ..." line per result, sorted by name."""
        sorted_results = sorted(self.found_results.items(), key=lambda x: x[0].lower())
        with open(file_path, "w", encoding="utf-8") as file:
            for name, pages in sorted_results:
                file.write(f"{name}: {', '.join(map(str, pages))}\n")

    def update_results_text(self, name, pages):
        """Slot for the search thread: show a status line or record a found name.

        Args:
            name: a status message, or the found name.
            pages: page numbers recorded for ``name`` so far.
        """
        if name in self._STATUS_MESSAGES:
            self.results_text.append(name)
            return

        # Ensure last name is capitalized. Every word is kept, so a
        # multi-word exact search text does not lose the surname found after it.
        parts = name.split()
        if len(parts) > 1:
            formatted_name = ' '.join(parts[:-1] + [parts[-1].capitalize()])
        else:
            formatted_name = name.capitalize()  # If single name, just capitalize it

        # Update found_results dictionary
        self.found_results[formatted_name] = pages

        # Clear and rebuild results text with updated capitalization
        self.results_text.clear()
        sorted_results = sorted(self.found_results.items(), key=lambda x: x[0].lower())
        for result_name, result_pages in sorted_results:
            self.results_text.append(f"{result_name}: {', '.join(map(str, result_pages))}")



# Script entry point: create the Qt application, show the main window and
# exit with the event loop's return code.
if __name__ == '__main__':
    app = QApplication(sys.argv)
    window = PDFSearchApp()
    window.show()
    sys.exit(app.exec_())


# This import was fused onto the end of the line above; it belongs on its own line.
import sys
from PyQt5.QtWidgets import (
    QApplication,
    QMainWindow,
    QWidget,
    QVBoxLayout,
    QHBoxLayout,
    QFileDialog,
    QPushButton,
    QTextEdit,
    QLineEdit,
    QLabel,
    QProgressBar,
    QMessageBox,
)
from PyQt5.QtCore import Qt, QThread, pyqtSignal
import fitz  # PyMuPDF
import re


class SearchThread(QThread):
    """Background worker that scans a PDF for "<name> <unknown word>" pairs.

    On every page each configured name is looked up case-insensitively as a
    whole word. The word following the first such occurrence on the page is
    treated as a surname candidate; it is reported unless it is very short,
    numeric, looks like a dotted token, or is present in the dictionary.

    Signals:
        update_progress(int): percentage of pages processed (0-100).
        update_results(str, list): either a status message with an empty
            list, or a found name with the 1-based page numbers recorded for
            it so far.
    """

    update_progress = pyqtSignal(int)  # Signal to update progress bar
    update_results = pyqtSignal(str, list)  # Signal to update results text (name, pages)

    def __init__(self, pdf_path, names, dictionary, stop_flag):
        """Store the search inputs.

        Args:
            pdf_path: path of the PDF to open with PyMuPDF.
            names: iterable of names to look for.
            dictionary: container of lower-case words; a following word found
                in it is not treated as a surname.
            stop_flag: zero-argument callable; a truthy return value aborts
                the search before the next page is processed.
        """
        super().__init__()
        self.pdf_path = pdf_path
        self.names = names
        self.dictionary = dictionary
        self.stop_flag = stop_flag
        self.results = {}  # Stores results as {name: [pages]}

    def run(self):
        """Scan the PDF page by page, emitting results and progress."""
        self.update_results.emit("Starting search...\n", [])
        candidates = self._searchable_names()
        stopped = False

        pdf_document = fitz.open(self.pdf_path)
        try:
            total_pages = pdf_document.page_count
            for page_number in range(total_pages):
                if self.stop_flag():
                    stopped = True
                    break

                page_text = pdf_document[page_number].get_text("text").lower()
                for name, pattern in candidates:
                    surname = self._surname_after(pattern, page_text)
                    if surname is None:
                        continue

                    full_name = f"{name} {surname}"
                    pages = self.results.setdefault(full_name, [])
                    current_page = page_number + 1
                    # A name listed twice in the input must not record the
                    # same page twice.
                    if not pages or pages[-1] != current_page:
                        pages.append(current_page)
                        # Emit a copy: the receiver lives in another thread
                        # and must not share the list this thread mutates.
                        self.update_results.emit(full_name, list(pages))

                # Update progress
                self.update_progress.emit(int((page_number + 1) / total_pages * 100))
        finally:
            # Release the document even if reading a page raised.
            pdf_document.close()

        final_message = "Search stopped.\n" if stopped else "Search completed.\n"
        self.update_results.emit(final_message, [])

    def _searchable_names(self):
        """Return ``(name, compiled whole-word pattern)`` pairs to search for.

        Blank names, names that are themselves dictionary words, and names
        whose parenthesised part is a dictionary word are left out.
        """
        searchable = []
        for name in self.names:
            if not name.strip():
                continue
            lowered = name.lower()
            if lowered in self.dictionary:
                continue
            bracketed = re.search(r'\((.*?)\)', name)
            if bracketed and bracketed.group(1).strip().lower() in self.dictionary:
                continue
            # Escape the name so characters such as "(" or "." are matched
            # literally instead of being interpreted as regex syntax.
            searchable.append((name, re.compile(rf'\b{re.escape(lowered)}\b')))
        return searchable

    def _surname_after(self, pattern, page_text):
        """Return the cleaned word following the first match, or ``None``.

        Only the first whole-word occurrence of the name on the page is
        examined. ``page_text`` is expected to be lower-cased already.
        """
        match = pattern.search(page_text)
        if match is None:
            return None

        following = page_text[match.end():].split()
        if not following:
            return None

        next_word = following[0].rstrip(",'")
        # One or two word characters: initials, "of", "is", ...
        if len(re.sub(r'\W', '', next_word)) in {1, 2}:
            return None
        # Dotted tokens such as "example.com" or "a.m".
        if re.match(r'\w+\.\w+', next_word):
            return None
        if next_word.isdigit():
            return None
        if not self.is_valid_second_word(next_word):
            return None
        if next_word.lower() in self.dictionary:
            return None

        # A word made only of punctuation cleans down to "", which is not a name.
        return self.clean_word(next_word) or None

    def is_valid_second_word(self, word):
        """Return True when ``word`` could plausibly be a surname.

        The word is reduced to alphanumerics plus ``# & - ? .``; the result
        must be longer than one character and must not be a bare
        day-of-month number (1-31).
        """
        # The comma is stripped by the cleaning below, so compare against the
        # bare numbers rather than "1,", "2,", ...
        invalid_words = {str(day) for day in range(1, 32)}
        invalid_special_chars = {'#', '&', '-', '?', '.'}
        cleaned_word = ''.join(
            char for char in word if char.isalnum() or char in invalid_special_chars
        )
        return (
            cleaned_word not in invalid_words
            and cleaned_word not in invalid_special_chars
            and len(cleaned_word) > 1
        )

    def clean_word(self, word):
        """Return ``word`` with everything but alphanumerics, '-' and '_' removed."""
        cleaned_word = ''.join(char for char in word if char.isalnum() or char in {'-', '_'})
        return cleaned_word.strip()


class PDFSearchApp(QMainWindow):
    """Main window: pick the input files, run a search, then sort or save the results."""

    # Status lines sent by the search thread; shown verbatim instead of being
    # stored as a found name.
    _STATUS_MESSAGES = ("Starting search...\n", "Search stopped.\n", "Search completed.\n")

    def __init__(self):
        """Create the window with empty results and no files selected."""
        super().__init__()
        self.setWindowTitle("PDF Search App")
        self.setGeometry(100, 100, 600, 700)

        self.stop_search_flag = False
        self.found_results = {}  # Stores results as {name: [pages]}
        self.dictionary = set()

        # Paths chosen through the file pickers; None until the user picks one.
        self.names_file_path = None
        self.pdf_file_path = None
        self.dictionary_file_path = None
        self.search_thread = None

        self.init_ui()

    def init_ui(self):
        """Build all widgets and stack them vertically in the central widget."""
        self.central_widget = QWidget()
        self.setCentralWidget(self.central_widget)

        self.layout = QVBoxLayout()

        # File pickers
        self.names_file_picker = self.create_file_picker("Names File:")
        self.pdf_file_picker = self.create_file_picker("PDF File:")
        self.dictionary_file_picker = self.create_file_picker("Dictionary File:")

        # Search and Stop buttons
        self.search_button = QPushButton("Search")
        self.search_button.clicked.connect(self.on_search)
        self.stop_button = QPushButton("Stop Search")
        self.stop_button.clicked.connect(self.on_stop_search)

        # Results text area
        self.results_text = QTextEdit()
        self.results_text.setReadOnly(True)

        # Search exact text
        self.search_exact_label = QLabel("Search Exact Text:")
        self.search_exact_input = QLineEdit()
        self.search_exact_button = QPushButton("Search Exact")
        self.search_exact_button.clicked.connect(self.on_search_exact)

        # Save results button
        self.save_button = QPushButton("Save Results")
        self.save_button.clicked.connect(self.on_save_results)

        # Sort buttons
        self.sort_first_last_button = QPushButton("Sort First Name Last Name")
        self.sort_first_last_button.clicked.connect(self.on_sort_first_last)
        self.sort_last_first_button = QPushButton("Sort Last Name, First Name")
        self.sort_last_first_button.clicked.connect(self.on_sort_last_first)

        # Progress bar
        self.progress_bar = QProgressBar()
        self.progress_bar.setValue(0)

        # Add widgets to layout
        self.layout.addWidget(self.names_file_picker)
        self.layout.addWidget(self.pdf_file_picker)
        self.layout.addWidget(self.dictionary_file_picker)
        self.layout.addWidget(self.search_button)
        self.layout.addWidget(self.stop_button)
        self.layout.addWidget(self.results_text)
        self.layout.addWidget(self.search_exact_label)
        self.layout.addWidget(self.search_exact_input)
        self.layout.addWidget(self.search_exact_button)
        self.layout.addWidget(self.save_button)
        self.layout.addWidget(self.sort_first_last_button)
        self.layout.addWidget(self.sort_last_first_button)
        self.layout.addWidget(self.progress_bar)

        self.central_widget.setLayout(self.layout)

    def create_file_picker(self, label_text):
        """Return a widget holding a label and a "Browse..." button for one input file."""
        layout = QHBoxLayout()
        label = QLabel(label_text)
        file_picker = QPushButton("Browse...")
        file_picker.clicked.connect(lambda: self.open_file_dialog(label_text))
        layout.addWidget(label)
        layout.addWidget(file_picker)
        widget = QWidget()
        widget.setLayout(layout)
        return widget

    def open_file_dialog(self, label_text):
        """Ask for a file and remember it as the names, PDF or dictionary path.

        Which path is set depends on the keyword found in ``label_text``.
        """
        # Offer the matching file type first so the PDF picker does not open
        # with only *.txt files visible.
        if "PDF" in label_text:
            file_filter = "PDF Files (*.pdf);;Text Files (*.txt)"
        else:
            file_filter = "Text Files (*.txt);;PDF Files (*.pdf)"

        file_path, _ = QFileDialog.getOpenFileName(self, f"Select {label_text}", "", file_filter)
        if file_path:
            if "Names" in label_text:
                self.names_file_path = file_path
            elif "PDF" in label_text:
                self.pdf_file_path = file_path
            elif "Dictionary" in label_text:
                self.dictionary_file_path = file_path

    def on_search(self):
        """Start a search for every name in the names file."""
        if self._search_in_progress():
            return

        self.stop_search_flag = False
        self.found_results.clear()
        self.results_text.clear()

        if not self.pdf_file_path:
            QMessageBox.critical(self, "Error", "Please select a PDF file.")
            return
        if not self.names_file_path:
            QMessageBox.critical(self, "Error", "Please select a names file.")
            return
        if not self.dictionary_file_path:
            QMessageBox.critical(self, "Error", "Please select a dictionary file.")
            return

        try:
            names = self.load_names(self.names_file_path)
            self.load_dictionary(self.dictionary_file_path)
        except (OSError, UnicodeDecodeError) as error:
            QMessageBox.critical(self, "Error", f"Could not read input file:\n{error}")
            return

        self._start_search(names)

    def on_stop_search(self):
        """Ask the running search to stop before its next page."""
        self.stop_search_flag = True

    def on_search_exact(self):
        """Start a search for the single text typed into the exact-search box.

        Uses whatever dictionary is currently loaded.
        """
        if self._search_in_progress():
            return

        self.stop_search_flag = False
        self.found_results.clear()
        self.results_text.clear()

        if not self.pdf_file_path:
            QMessageBox.critical(self, "Error", "Please select a PDF file.")
            return

        exact_text = self.search_exact_input.text().strip()
        if not exact_text:
            QMessageBox.critical(self, "Error", "Please enter exact text to search.")
            return

        self._start_search([exact_text])

    def _search_in_progress(self):
        """Warn and return True if a previous search thread is still running.

        Replacing the reference to a running QThread would let it be
        destroyed mid-run, so a new search is refused instead.
        """
        if self.search_thread is not None and self.search_thread.isRunning():
            QMessageBox.warning(
                self,
                "Search Running",
                "A search is already running. Stop it or wait for it to finish.",
            )
            return True
        return False

    def _start_search(self, names):
        """Create, wire up and start a SearchThread for ``names``."""
        self.search_thread = SearchThread(
            self.pdf_file_path, names, self.dictionary, lambda: self.stop_search_flag
        )
        self.search_thread.update_progress.connect(self.progress_bar.setValue)
        self.search_thread.update_results.connect(self.update_results_text)
        self.search_thread.start()

    def on_save_results(self):
        """Ask for a destination and write the current results to it."""
        file_path, _ = QFileDialog.getSaveFileName(self, "Save Results", "", "Text Files (*.txt)")
        if file_path:
            self.save_results_to_file(file_path)
            QMessageBox.information(self, "Info", f"Results saved successfully to:\n{file_path}")

    def on_sort_first_last(self):
        """Re-display the results sorted by the name as stored."""
        self.sort_results("first_last")

    def on_sort_last_first(self):
        """Re-display the results sorted by, and rewritten to start with, the last word."""
        self.sort_results("last_first")

    def sort_results(self, sort_order):
        """Sort the results, redraw the text area and store the new order.

        For ``"last_first"`` each multi-word name is rewritten as
        "<last word> <first word>" and the rewritten names replace the keys
        of ``found_results``, so a later save uses them too.
        """
        sorted_results = sorted(
            self.found_results.items(),
            key=lambda x: self.get_sort_key(x[0], sort_order),
        )

        # Clear the results dictionary and update with the sorted names
        new_results = {}
        self.results_text.clear()

        for result_name, result_pages in sorted_results:
            if sort_order == "last_first":
                parts = result_name.split()
                if len(parts) > 1:
                    new_name = f"{parts[-1]} {parts[0]}"  # Rewriting the name as "Last First"
                else:
                    # Zero or one word: nothing to swap.
                    new_name = parts[0] if parts else result_name
            else:
                new_name = result_name  # Keep original order

            new_results[new_name] = result_pages
            self.results_text.append(f"{new_name}: {', '.join(map(str, result_pages))}")

        # Replace found_results with updated names
        self.found_results = new_results

    def get_sort_key(self, name, sort_order):
        """Return the string ``name`` is ordered by under ``sort_order``."""
        parts = name.split()
        if not parts:
            # Blank name: there is no first or last word to build a key from.
            return name
        if sort_order == "first_last":
            return ' '.join(parts)
        elif sort_order == "last_first":
            if len(parts) > 1:
                return f"{parts[-1]}, {' '.join(parts[:-1])}"
            else:
                return parts[0]
        return name

    def load_names(self, file_path):
        """Return the non-blank lines of ``file_path``, stripped of whitespace."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file if line.strip()]

    def load_dictionary(self, file_path):
        """Load the lower-cased first token of every non-blank line into ``self.dictionary``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            self.dictionary = {line.strip().split()[0].lower() for line in file if line.strip()}

    def save_results_to_file(self, file_path):
        """Write one "name: page, page, ..." line per result, sorted by name."""
        sorted_results = sorted(self.found_results.items(), key=lambda x: x[0].lower())
        with open(file_path, "w", encoding="utf-8") as file:
            for name, pages in sorted_results:
                file.write(f"{name}: {', '.join(map(str, pages))}\n")

    def update_results_text(self, name, pages):
        """Slot for the search thread: show a status line or record a found name.

        Args:
            name: a status message, or the found name.
            pages: page numbers recorded for ``name`` so far.
        """
        if name in self._STATUS_MESSAGES:
            self.results_text.append(name)
            return

        # Ensure last name is capitalized. Every word is kept, so a
        # multi-word exact search text does not lose the surname found after it.
        parts = name.split()
        if len(parts) > 1:
            formatted_name = ' '.join(parts[:-1] + [parts[-1].capitalize()])
        else:
            formatted_name = name.capitalize()  # If single name, just capitalize it

        # Update found_results dictionary
        self.found_results[formatted_name] = pages

        # Clear and rebuild results text with updated capitalization
        self.results_text.clear()
        sorted_results = sorted(self.found_results.items(), key=lambda x: x[0].lower())
        for result_name, result_pages in sorted_results:
            self.results_text.append(f"{result_name}: {', '.join(map(str, result_pages))}")



# Script entry point: build the Qt application, show the main window, run the
# event loop and hand its return code back to the operating system.
if __name__ == '__main__':
    app = QApplication(sys.argv)

    window = PDFSearchApp()
    window.show()

    exit_code = app.exec_()
    sys.exit(exit_code)

 

 

Link to comment
https://www.neowin.net/forum/topic/1437118-pdf-name-search-python-app/
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
  • Recently Browsing   0 members

    • No registered users viewing this page.