Recommended Posts

import wx
import fitz  # PyMuPDF
import string
import re

class PDFSearchApp(wx.Frame):
    """Main window of a small tool that looks for names inside a PDF.

    Two kinds of search are offered:

    * "Search" reads a list of names from a text file, looks for each name on
      every page of the PDF and reports the name together with the word that
      follows it, unless that word (or the name itself) is filtered out, e.g.
      because it appears in the optional dictionary file.
    * "Search Exact" reports every page that contains the text typed into the
      search box as a whole word/phrase (case-insensitive).

    Results are shown in the read-only text box and kept in
    ``self.found_results`` so they can be written to a file on request.
    """

    # Day-of-month numbers; a "next word" that is just one of these is treated
    # as part of a date rather than as a name.
    _DAY_NUMBERS = frozenset(str(day) for day in range(1, 32))

    # Punctuation that survives the clean-up done by ``is_valid_second_word``.
    _KEPT_SPECIAL_CHARS = frozenset({'#', '&', '-', '?', '.'})

    def __init__(self, *args, **kw):
        """Create the frame; arguments are forwarded unchanged to ``wx.Frame``."""
        super().__init__(*args, **kw)

        # Initialise state before building widgets so that no event handler
        # can ever observe a frame without these attributes.
        self.stop_search_flag = False  # Set by the "Stop Search" button
        self.found_results = []  # Result lines of the most recent search
        self.dictionary = set()  # Lower-cased dictionary words

        self.panel = wx.Panel(self)
        self.create_widgets()

    def create_widgets(self):
        """Create all controls, bind their handlers and lay them out vertically."""
        # NOTE: controls are created in this order on purpose; the order of
        # creation is kept identical to the layout below.
        self.file_picker_names = wx.FilePickerCtrl(self.panel, message="Choose the Names TXT file:")
        self.file_picker_pdf = wx.FilePickerCtrl(self.panel, message="Choose the PDF file:")
        self.file_picker_dictionary = wx.FilePickerCtrl(self.panel, message="Choose the Dictionary TXT file:")

        self.search_button = wx.Button(self.panel, label="Search", size=(80, 30))
        self.search_button.Bind(wx.EVT_BUTTON, self.on_search)

        self.stop_search_button = wx.Button(self.panel, label="Stop Search", size=(100, 30))
        self.stop_search_button.Bind(wx.EVT_BUTTON, self.on_stop_search)

        self.result_text = wx.TextCtrl(self.panel, style=wx.TE_MULTILINE | wx.TE_READONLY, size=(400, 300))

        # Exact-text search box
        self.search_text = wx.TextCtrl(self.panel, size=(200, -1))
        self.search_button_exact = wx.Button(self.panel, label="Search Exact", size=(120, 30))
        self.search_button_exact.Bind(wx.EVT_BUTTON, self.on_search_exact)

        # Save Results button
        self.save_button = wx.Button(self.panel, label="Save Results", size=(120, 30))
        self.save_button.Bind(wx.EVT_BUTTON, self.on_save_results)

        sizer = wx.BoxSizer(wx.VERTICAL)
        sizer.Add(wx.StaticText(self.panel, label="Names File:"), 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(self.file_picker_names, 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(wx.StaticText(self.panel, label="PDF File:"), 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(self.file_picker_pdf, 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(wx.StaticText(self.panel, label="Dictionary File:"), 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(self.file_picker_dictionary, 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(self.search_button, 0, wx.ALL | wx.CENTER, 10)
        sizer.Add(self.stop_search_button, 0, wx.ALL | wx.CENTER, 10)
        sizer.Add(self.result_text, 1, wx.ALL | wx.EXPAND, 10)

        # Exact-text search box
        sizer.Add(wx.StaticText(self.panel, label="Search Exact Text:"), 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(self.search_text, 0, wx.ALL | wx.EXPAND, 10)
        sizer.Add(self.search_button_exact, 0, wx.ALL | wx.CENTER, 10)

        # Save Results button
        sizer.Add(self.save_button, 0, wx.ALL | wx.CENTER, 10)

        self.panel.SetSizer(sizer)

    def on_search(self, event):
        """Handle the "Search" button: validate the inputs and run the name search.

        The PDF and the names file are required. The dictionary file is
        optional; without it no word is filtered out by the dictionary.
        """
        print("Search button clicked")
        self.stop_search_flag = False  # Reset the flag
        self.found_results.clear()  # Clear previous results
        names_file_path = self.file_picker_names.GetPath()
        pdf_file_path = self.file_picker_pdf.GetPath()
        dictionary_file_path = self.file_picker_dictionary.GetPath()

        print(f"Names File Path: {names_file_path}")
        print(f"PDF File Path: {pdf_file_path}")
        print(f"Dictionary File Path: {dictionary_file_path}")

        if not pdf_file_path:
            wx.MessageBox("Please select a PDF file.", "Error", wx.OK | wx.ICON_ERROR)
            return

        if not names_file_path:
            # open('') would otherwise fail with an unhandled FileNotFoundError.
            wx.MessageBox("Please select a Names file.", "Error", wx.OK | wx.ICON_ERROR)
            return

        try:
            names = self.load_names(names_file_path)
            if dictionary_file_path:
                self.load_dictionary(dictionary_file_path)
            else:
                # Do not silently reuse the dictionary of an earlier search.
                self.dictionary = set()
        except (OSError, UnicodeDecodeError) as err:
            wx.MessageBox(f"Could not read input file:\n{err}", "Error", wx.OK | wx.ICON_ERROR)
            return

        self.search_and_display_results(pdf_file_path, names)

    def on_stop_search(self, event):
        """Handle the "Stop Search" button by asking the running scan to stop.

        The flag is checked before each page; results found so far are kept
        and are not saved automatically.
        """
        print("Stop Search button clicked")
        self.stop_search_flag = True

    def on_search_exact(self, event):
        """Handle the "Search Exact" button: validate inputs and run the exact search."""
        print("Search Exact button clicked")
        self.stop_search_flag = False  # Reset the flag
        self.found_results.clear()  # Clear previous results
        pdf_file_path = self.file_picker_pdf.GetPath()
        exact_text = self.search_text.GetValue().strip()

        print(f"PDF File Path: {pdf_file_path}")
        print(f"Exact Text to Search: {exact_text}")

        if not pdf_file_path or not exact_text:
            wx.MessageBox("Please select a PDF file and enter exact text to search.", "Error", wx.OK | wx.ICON_ERROR)
            return

        self.search_exact_and_display_results(pdf_file_path, exact_text)

    def load_names(self, file_path):
        """Return the non-blank lines of *file_path*, stripped of whitespace.

        The file is decoded as UTF-8; a leading byte-order mark, if present,
        is dropped so that it cannot become part of the first name.

        Raises:
            OSError: if the file cannot be opened or read.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            stripped_lines = (line.strip() for line in file)
            return [name for name in stripped_lines if name]

    def load_dictionary(self, file_path):
        """Fill ``self.dictionary`` with the lower-cased first word of every non-blank line.

        The file is decoded as UTF-8 (a leading byte-order mark is ignored).

        Raises:
            OSError: if the file cannot be opened or read.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            self.dictionary = {line.split()[0].lower() for line in file if line.strip()}

    def is_valid_second_word(self, word):
        """Return True if *word* is acceptable as the word following a name.

        Everything except letters, digits and ``# & - ? .`` is removed first.
        The remainder is rejected when it is at most one character long or
        when it is just a day-of-month number (1-31), as in "March 12, 2020".
        """
        cleaned_word = ''.join(
            char for char in word
            if char.isalnum() or char in self._KEPT_SPECIAL_CHARS
        )
        # Commas are already gone at this point, so compare against the bare
        # numbers; comparing against "12," could never match.
        return len(cleaned_word) > 1 and cleaned_word not in self._DAY_NUMBERS

    def clean_word(self, word):
        """Return *word* reduced to its letters, digits, hyphens and underscores.

        Characters are filtered at every position, not only at the ends.
        """
        return ''.join(char for char in word if char.isalnum() or char in {'-', '_'})

    @staticmethod
    def _compile_whole_word_pattern(text):
        """Compile a pattern matching lower-cased *text* literally as a whole word.

        The text is escaped so regex metacharacters in user input are matched
        literally. Look-arounds are used instead of ``\\b`` so that text which
        starts or ends with punctuation can still match; for text that starts
        and ends with a word character both forms are equivalent.
        """
        return re.compile(rf'(?<!\w){re.escape(text.lower())}(?!\w)')

    def _set_search_controls_enabled(self, enabled):
        """Enable or disable the two buttons that start a search."""
        self.search_button.Enable(enabled)
        self.search_button_exact.Enable(enabled)

    def _scan_pages(self, pdf_file_path, handle_page):
        """Call ``handle_page(page_number, lowercased_text)`` for each page of the PDF.

        Page numbers are 1-based. Pending GUI events are processed before
        every page so the "Stop Search" button takes effect promptly; the scan
        ends as soon as ``self.stop_search_flag`` is set. While scanning, the
        search buttons are disabled so that the event processing cannot start
        a second, nested search. The document is closed and the buttons are
        re-enabled even if an error occurs.
        """
        self._set_search_controls_enabled(False)
        try:
            pdf_document = fitz.open(pdf_file_path)
            try:
                for page_index in range(pdf_document.page_count):
                    wx.Yield()  # Let the Stop button (and repaints) through
                    if self.stop_search_flag:
                        print("Search stopped.")
                        break

                    page_text = pdf_document[page_index].get_text("text")
                    handle_page(page_index + 1, page_text.lower())
            finally:
                pdf_document.close()
        finally:
            self._set_search_controls_enabled(True)

    def _is_searchable_name(self, name):
        """Return True unless *name* is blank or excluded by the dictionary.

        A name is excluded when the whole name, or the text inside its first
        pair of parentheses, is a dictionary word.
        """
        if not name.strip():
            return False
        if name.lower() in self.dictionary:
            return False
        content_in_parentheses = re.search(r'\((.*?)\)', name)
        if content_in_parentheses:
            if content_in_parentheses.group(1).strip().lower() in self.dictionary:
                return False
        return True

    def _pick_next_word(self, following_text):
        """Return the first word of *following_text* if it passes all filters, else None.

        *following_text* is expected to be lower-cased already.
        """
        tokens = following_text.split(None, 1)
        if not tokens:
            return None
        next_word = tokens[0].rstrip(',\'')

        # Ignore words with only 1 or 2 word characters
        if len(re.sub(r'\W', '', next_word)) in {1, 2}:
            return None
        # Ignore things that look like a domain (e.g. "example.com")
        if re.match(r'\w+\.\w+', next_word):
            return None
        # Ignore pure numbers
        if next_word.isdigit():
            return None
        if not self.is_valid_second_word(next_word):
            return None
        if next_word.lower() in self.dictionary:
            return None
        return next_word

    def search_and_display_results(self, pdf_file_path, names):
        """Search every page of the PDF for *names* and show name/next-word pairs.

        For each page and each name, the first whole-word, case-insensitive
        occurrence of the name is located and the word right after it is
        examined. If that word passes the filters, a line
        ``"<name> <next word>, Page: <n>"`` is appended to the result box and
        to ``self.found_results``. Results are not saved automatically.
        """
        print(f"Searching PDF file: {pdf_file_path}")
        self.result_text.Clear()

        # Name-only filters and pattern compilation do not depend on the page,
        # so they are done once up front instead of once per page.
        candidates = [
            (name, self._compile_whole_word_pattern(name))
            for name in names
            if self._is_searchable_name(name)
        ]

        def handle_page(page_number, page_text):
            for name, pattern in candidates:
                match = pattern.search(page_text)
                if match is None:
                    continue
                # Take the text after the matched occurrence itself, not after
                # the first raw substring occurrence (which may sit inside a
                # longer word).
                next_word = self._pick_next_word(page_text[match.end():])
                if next_word is None:
                    continue
                result = f"{name} {self.clean_word(next_word)}, Page: {page_number}\n"
                self.result_text.AppendText(result)
                self.found_results.append(result)
                print(f"Match found: {result}")

        self._scan_pages(pdf_file_path, handle_page)

        print("Search completed.")

    def search_exact_and_display_results(self, pdf_file_path, exact_text):
        """Report every page of the PDF that contains *exact_text*.

        The text is matched literally, case-insensitively and as a whole
        word/phrase. One line per matching page is appended to the result box
        and to ``self.found_results``. Results are not saved automatically.
        """
        print(f"Searching Exact Text in PDF file: {pdf_file_path}")
        self.result_text.Clear()

        pattern = self._compile_whole_word_pattern(exact_text)

        def handle_page(page_number, page_text):
            if pattern.search(page_text):
                result = f"Exact Text '{exact_text}' found on Page: {page_number}\n"
                self.result_text.AppendText(result)
                self.found_results.append(result)
                print(f"Exact Text found: {result}")

        self._scan_pages(pdf_file_path, handle_page)

        print("Search completed.")

    def on_save_results(self, event):
        """Handle the "Save Results" button by opening the save dialog."""
        print("Save Results button clicked")
        self.save_results_to_file_dialog()

    def save_results_to_file_dialog(self):
        """Ask for a destination file and write the current results to it.

        Nothing is written if the dialog is cancelled. A message box reports
        success, or the error if the file could not be written.
        """
        dlg = wx.FileDialog(
            self, message="Save Results As...",
            defaultDir=wx.GetHomeDir(),
            defaultFile="results.txt",
            wildcard="Text files (*.txt)|*.txt|All files (*.*)|*.*",
            style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT
        )
        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            file_path = dlg.GetPath()
        finally:
            # Always release the native dialog, even on an unexpected error.
            dlg.Destroy()

        try:
            self.save_results_to_file(file_path)
        except OSError as err:
            wx.MessageBox(f"Could not save results:\n{err}", "Error", wx.OK | wx.ICON_ERROR)
            return

        wx.MessageBox(f"Results saved successfully to:\n{file_path}", "Info", wx.OK | wx.ICON_INFORMATION)

    def save_results_to_file(self, file_path):
        """Write the found results to *file_path*, sorted case-insensitively.

        Raises:
            OSError: if the file cannot be created or written.
        """
        sorted_results = sorted(self.found_results, key=str.lower)
        with open(file_path, "w", encoding="utf-8") as file:
            file.writelines(sorted_results)

def _launch_gui():
    """Create the wx application, show the search window and run the event loop."""
    # False: keep stdout/stderr on the console instead of a wx output window.
    application = wx.App(False)
    window = PDFSearchApp(None, title='PDF Search App', size=(600, 700))
    window.Show()
    application.MainLoop()


# Start the GUI only when this file is executed as a script.
if __name__ == '__main__':
    _launch_gui()

 

image.png

image.png

  • Like 2
Link to comment
https://www.neowin.net/forum/topic/1437118-pdf-name-search-python-app/
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
  • Recently Browsing   0 members

    • No registered users viewing this page.