# Source code for autowriterllm.markdown_to_pdf_converter

import datetime
import logging
import queue
import re
import subprocess
import tempfile
import threading
import tkinter as tk
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from tkinter import ttk, filedialog, scrolledtext
from typing import Optional, List, Dict, Any

# Logging setup: records at INFO and above are sent both to a log file named
# "pdf_conversion.log" and to a console stream handler.
_log_handlers = [
    logging.FileHandler("pdf_conversion.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)

# Module-scoped logger used by everything defined in this file.
logger = logging.getLogger(__name__)


@dataclass
class ConverterConfig:
    """Configuration settings for the PDF converter.

    Every instance receives its own copy of the mutable defaults (the
    encoding list and the pandoc option mapping), so changing one
    configuration never leaks into another.

    Attributes:
        max_workers (int): Maximum number of worker threads for parallel processing
        supported_encodings (List[str]): List of encodings to try when reading files
        temp_filename (str): Name of temporary combined markdown file
        pandoc_options (Dict[str, Any]): Additional pandoc conversion options
    """

    max_workers: int = 4
    supported_encodings: List[str] = field(default_factory=lambda: [
        "utf-8", "utf-8-sig", "gbk", "gb2312", "gb18030", "latin1"
    ])
    temp_filename: str = "combined_temp.md"
    pandoc_options: Dict[str, Any] = field(default_factory=lambda: {
        "toc_depth": 3,
        "margin": "1in",
        "highlight_style": "tango"
    })


class MarkdownToPDFConverter:
    """Converts markdown files to a single PDF document using pandoc.

    The ``chapter-*.md`` files found in ``output_dir`` are ordered by the
    numbers in their names, merged into one temporary markdown document
    (dropping repeated headers) and handed to ``pandoc`` with the ``xelatex``
    PDF engine.
    """

    # Opening/closing marker of a fenced code block: three or more backticks
    # or tildes, indented by at most three spaces.
    _FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")
    # ATX header: at most three spaces of indent, one to six '#', then
    # whitespace or end of line ("#tag" and "#!/bin/sh" are NOT headers).
    _ATX_HEADER_RE = re.compile(r"^ {0,3}#{1,6}(?:\s|$)")
    # Setext underline: a run made only of '=' or only of '-'.
    _SETEXT_UNDERLINE_RE = re.compile(r"^ {0,3}(?:=+|-+)\s*$")

    # LaTeX preamble injected through pandoc's --include-in-header option.
    _LATEX_HEADER = r"""
\usepackage{xcolor}
\usepackage{listings}
\usepackage{geometry}
\usepackage{fancyhdr}
\usepackage{titlesec}

% Page setup
\geometry{a4paper, margin=1in}
\pagestyle{fancy}
\fancyhf{}
\fancyhead[L]{\nouppercase{\leftmark}}
\fancyhead[R]{\thepage}
\renewcommand{\headrulewidth}{0.4pt}

% Chapter and section formatting - removed numbering
\titleformat{\chapter}
{\normalfont\huge\bfseries}{}{0pt}{\Huge}
\titlespacing*{\chapter}{0pt}{-20pt}{40pt}

% Remove section numbering
\setcounter{secnumdepth}{0}

% Code block styling
\definecolor{codebackground}{RGB}{248,248,248}
\definecolor{codecomment}{RGB}{106,153,85}
\definecolor{codekeyword}{RGB}{86,156,214}
\definecolor{codestring}{RGB}{206,145,120}

% Configure listings for copy-friendly code blocks
\lstset{
    basicstyle=\ttfamily\small,
    backgroundcolor=\color{codebackground},
    breaklines=true,
    breakatwhitespace=false,
    numbers=none,  % Removed line numbers
    keywordstyle=\color{codekeyword},
    commentstyle=\color{codecomment},
    stringstyle=\color{codestring},
    frame=single,
    tabsize=4,
    showstringspaces=false,
    showspaces=false,
    showtabs=false,
    captionpos=b,
    breakindent=0pt,
    xleftmargin=0.5em,  % Reduced left margin since we removed line numbers
    xrightmargin=0.5em,
    language=Python,
    escapeinside={(*@}{@*)},
    keepspaces=true,
    columns=flexible,
    basewidth=0.5em,
    mathescape=true,
    upquote=true,  % Use straight quotes
    literate={*}{{\char42}}1  % Fix asterisk rendering
}

% Fix for long code blocks
\lstset{
    breaklines=true,
    postbreak=\mbox{\textcolor{red}{$\hookrightarrow$}\space},
    breakindent=0pt
}
"""

    def __init__(
        self, toc_file: Path, output_dir: Path, css_file: Optional[Path] = None,
        config: Optional["ConverterConfig"] = None
    ):
        """Initialize the converter.

        Args:
            toc_file: Path to table of contents markdown file
            output_dir: Directory containing markdown files
            css_file: Optional path to custom CSS file (stored only; the
                xelatex pipeline in this class does not read it)
            config: Optional configuration settings

        Raises:
            ValueError: If toc_file or output_dir don't exist
        """
        self.toc_file = Path(toc_file)
        self.output_dir = Path(output_dir)
        self.css_file = css_file
        self.config = config or ConverterConfig()
        # Filled in by _check_dependencies() once a working binary is found.
        self.wkhtmltopdf_path: Optional[str] = None

        # Validate paths
        if not self.toc_file.exists():
            raise ValueError(f"TOC file not found: {self.toc_file}")
        if not self.output_dir.is_dir():
            raise ValueError(f"Invalid output directory: {self.output_dir}")

        # Queue that worker threads push progress messages onto
        self.log_queue = queue.Queue()

    def _check_dependencies(self) -> None:
        """Check if required dependencies are installed.

        Raises:
            RuntimeError: If the ``pandoc`` executable cannot be started.
            FileNotFoundError: If no ``wkhtmltopdf`` executable is found.
        """
        # Detect a missing pandoc from the failed launch itself rather than by
        # searching the OS error text, which does not mention the program
        # name on every platform.
        try:
            subprocess.run(["pandoc", "--version"], capture_output=True, text=True)
        except FileNotFoundError as exc:
            raise RuntimeError(
                "pandoc not found. Please install pandoc first."
            ) from exc
        logger.info("Found pandoc installation")

        # NOTE(review): the conversion below runs pandoc with the xelatex
        # engine and never invokes wkhtmltopdf; this requirement is kept only
        # to preserve the existing contract - confirm whether it is needed.
        wkhtmltopdf_paths = [
            "wkhtmltopdf",  # If in PATH
            r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe",
            r"C:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe",
        ]
        for candidate in wkhtmltopdf_paths:
            try:
                subprocess.run(
                    [candidate, "--version"], capture_output=True, text=True
                )
            except FileNotFoundError:
                continue
            self.wkhtmltopdf_path = candidate  # Store the working path
            logger.info("Found wkhtmltopdf at: %s", candidate)
            return

        raise FileNotFoundError(
            "wkhtmltopdf not found. Please install wkhtmltopdf from: "
            "https://wkhtmltopdf.org/downloads.html"
        )

    @staticmethod
    def _chapter_sort_key(filename: str) -> tuple:
        """Return the numeric ordering key of a chapter filename.

        Args:
            filename: The filename to create a sort key for

        Returns:
            A tuple of integers representing the chapter number, or
            ``(float('inf'),)`` when the name carries no leading number.

        Example:
            'chapter-1-2.md' -> (1, 2)
            'chapter-1. introduction.md' -> (1,)
        """
        base = filename
        # Strip only the real prefix/suffix, not every occurrence in the name.
        if base.startswith("chapter-"):
            base = base[len("chapter-"):]
        if base.endswith(".md"):
            base = base[:-len(".md")]

        numbers = []
        for part in base.split("-"):
            match = re.match(r"(\d+)", part)
            if match is None:
                break  # first non-numeric part ends the chapter number
            numbers.append(int(match.group(1)))

        return tuple(numbers) if numbers else (float("inf"),)

    def _parse_toc(self) -> List[str]:
        """Return the chapter files of ``output_dir`` in reading order.

        The table-of-contents file itself is not read; the order is derived
        from the numbers embedded in the ``chapter-*.md`` filenames.

        Returns:
            List[str]: Ordered list of markdown filenames

        Example ordering:
            chapter-1.md
            chapter-1-1.md
            chapter-1-1-1.md
            chapter-1-2.md
            chapter-2.md
            chapter-2-1.md
        """
        try:
            logger.info("Parsing table of contents...")

            # Only regular files: a directory matching the pattern cannot be read.
            all_md_files = [
                f.name for f in self.output_dir.glob("chapter-*.md") if f.is_file()
            ]
            logger.info(f"Found {len(all_md_files)} chapter files in directory")

            if not all_md_files:
                logger.warning("No chapter files found in directory")
                return []

            # The filename is a tie-breaker so that files with the same
            # numeric key come out in a reproducible order instead of the
            # arbitrary order returned by glob().
            sorted_files = sorted(
                all_md_files,
                key=lambda name: (self._chapter_sort_key(name), name),
            )

            logger.info("Sorted chapter files in correct order:")
            for name in sorted_files:
                logger.debug(f"  {name}")

            return sorted_files

        except Exception as e:
            logger.error(f"Error parsing table of contents: {e}")
            raise

    def _read_file_with_fallback_encoding(self, file_path: Path) -> str:
        """Read file content with fallback encodings.

        Tries ``utf-8-sig`` (which also strips a byte-order mark), then
        ``utf-8``, then ``latin1``, and returns the first successful decode
        with ``\\r\\n`` normalised to ``\\n``.

        Args:
            file_path: File to read.

        Returns:
            The decoded text.

        Raises:
            UnicodeError: If none of the encodings can decode the file.
        """
        encodings = ["utf-8-sig", "utf-8", "latin1"]  # Simplified encoding list

        errors = []
        for encoding in encodings:
            logger.debug(f"Trying to read {file_path} with {encoding} encoding")
            try:
                with open(file_path, 'r', encoding=encoding) as handle:
                    content = handle.read()
            except UnicodeError as e:
                errors.append(f"{encoding}: {str(e)}")
                continue
            # Normalize line endings
            return content.replace('\r\n', '\n')

        error_msg = f"Failed to read {file_path} with any supported encoding:\n" + "\n".join(errors)
        logger.error(error_msg)
        raise UnicodeError(error_msg)

    def _process_markdown_files(self, files: List[str]) -> List[str]:
        """Process markdown files in parallel.

        Args:
            files: List of markdown filenames to process

        Returns:
            List[str]: Processed markdown content, in the same order as ``files``

        Raises:
            RuntimeError: If any file processing fails
        """
        failures = []

        def load(filename: str) -> str:
            # Collect failures instead of raising so that every broken file
            # is reported together after the pool has finished.
            try:
                text = self._read_file_with_fallback_encoding(
                    self.output_dir / filename
                )
            except Exception as e:
                failures.append((filename, str(e)))
                return ""
            self.log_queue.put(f"Processed {filename}")
            return text

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            processed_content = list(executor.map(load, files))

        if failures:
            error_msg = "\n".join(f"{f}: {e}" for f, e in failures)
            raise RuntimeError(f"Failed to process files:\n{error_msg}")

        return processed_content

    def convert_to_pdf(self, output_file: Path) -> None:
        """Convert markdown files to PDF using pandoc.

        Args:
            output_file: Path where the output PDF will be saved

        Raises:
            RuntimeError: If conversion fails
            FileNotFoundError: If required dependencies are missing
        """
        try:
            self._check_dependencies()
            files = self._parse_toc()

            # Process files in parallel
            contents = self._process_markdown_files(files)

            # Create temporary combined file
            temp_md = self.output_dir / self.config.temp_filename
            self._create_combined_markdown(temp_md, contents)

            # Run pandoc conversion
            self._run_pandoc_conversion(temp_md, output_file)

        except Exception as e:
            logger.error(f"PDF conversion failed: {e}")
            raise
        finally:
            # Clean up
            self._cleanup()

    def _build_front_matter(self) -> str:
        """Return the YAML metadata block (title and date) for the document."""
        title = self.toc_file.stem.replace("_", " ").title()
        # Double-quoted YAML scalar: characters such as ':' or ' #' in the
        # title can no longer break or truncate the value.
        escaped = title.replace("\\", "\\\\").replace('"', '\\"')
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        # The lines are joined with single newlines: pandoc does not accept a
        # metadata block whose opening '---' is followed by a blank line.
        return "\n".join(["---", f'title: "{escaped}"', f"date: {today}", "---"])

    def _dedupe_headers(self, content: str, seen_headers: set) -> List[str]:
        """Return the lines of ``content`` without headers already emitted.

        Args:
            content: Markdown text of one file.
            seen_headers: Header texts emitted so far; updated in place.

        Returns:
            The retained lines, right-stripped, with leading blank lines
            removed. Only the header line (and its Setext underline) of a
            repeated header is dropped; the text below it is kept.
        """
        lines = content.split('\n')
        kept: List[str] = []
        open_fence: Optional[str] = None  # marker that opened the current code fence
        i = 0
        while i < len(lines):
            line = lines[i].rstrip()
            i += 1

            # Skip empty lines at the start
            if not kept and not line:
                continue

            fence_match = self._FENCE_RE.match(line)
            if open_fence is not None:
                # Inside a code block nothing is a header ('# comment' lines
                # must survive); only look for the matching closing fence.
                if fence_match:
                    marker = fence_match.group(1)
                    if (marker[0] == open_fence[0]
                            and len(marker) >= len(open_fence)
                            and line.strip() == marker):
                        open_fence = None
                kept.append(line)
                continue
            if fence_match:
                open_fence = fence_match.group(1)
                kept.append(line)
                continue

            # ATX style header (# Title)
            if self._ATX_HEADER_RE.match(line):
                header_text = line.strip()
                if header_text not in seen_headers:
                    seen_headers.add(header_text)
                    kept.append(line)
                continue

            # Setext style header: non-empty text followed by === or ---.
            # A rule that follows a blank line is NOT a header and is kept.
            if (line.strip() and i < len(lines)
                    and self._SETEXT_UNDERLINE_RE.match(lines[i].rstrip())):
                underline = lines[i].rstrip()
                i += 1  # consume the underline together with its text
                header_text = line.strip()
                if header_text not in seen_headers:
                    seen_headers.add(header_text)
                    kept.extend([line, underline])
                continue

            kept.append(line)

        return kept

    def _create_combined_markdown(self, temp_md: Path, contents: List[str]):
        """Create a combined markdown file from processed markdown content.

        Args:
            temp_md: Destination of the combined document (UTF-8, no BOM).
            contents: Markdown text of each file, in reading order.
        """
        sections = [self._build_front_matter()]

        # Keep track of seen headers to avoid duplicates across files
        seen_headers = set()

        for content in contents:
            kept = self._dedupe_headers(content, seen_headers)
            if kept:
                sections.append('\n'.join(kept))

        # Sections are separated by a blank line
        logger.debug(f"Writing combined content with {len(sections)} sections")
        temp_md.write_text('\n\n'.join(sections) + '\n', encoding='utf-8')

    def _run_pandoc_conversion(self, temp_md: Path, output_file: Path):
        """Run pandoc conversion using the combined markdown file.

        Args:
            temp_md: Combined markdown document to convert.
            output_file: Path of the PDF to create.

        Raises:
            subprocess.CalledProcessError: If pandoc exits with a non-zero status.
        """
        # Get title from toc file
        title = self.toc_file.stem.replace("_", " ").title()

        header_file: Optional[Path] = None
        try:
            # A uniquely named file: a fixed "header.tex" would overwrite and
            # then delete a file of that name already present in output_dir.
            with tempfile.NamedTemporaryFile(
                "w",
                suffix=".tex",
                prefix="header_",
                dir=str(self.output_dir),
                delete=False,
                encoding="utf-8",
            ) as handle:
                header_file = Path(handle.name)
                handle.write(self._LATEX_HEADER)

            # Build pandoc command - removed --number-sections
            cmd = [
                "pandoc",
                str(temp_md),
                "--pdf-engine=xelatex",
                "--toc",
                "--toc-depth=3",
                "--top-level-division=chapter",
                "-V",
                "documentclass=report",
                "-V",
                f"title={title}",
                "--highlight-style=pygments",
                "-f",
                "markdown+smart+fenced_code_blocks+auto_identifiers",
                "--listings",
                f"--include-in-header={header_file}",
                "--wrap=none",
                "-V",
                "papersize=a4",
                "-V",
                "fontsize=11pt",
                "-V",
                "geometry:margin=1in",
                "-V",
                "linkcolor=blue",
                "--variable",
                "urlcolor=blue",
                "--variable",
                "toccolor=black",
                "-V",
                "colorlinks=true",
                # NOTE(review): -shell-escape lets LaTeX in the input run
                # shell commands; nothing in the header above appears to need
                # it - confirm before converting untrusted markdown.
                "--pdf-engine-opt=-shell-escape",
                "--verbose",
                "-o",
                str(output_file),
            ]

            logger.info("Running pandoc command: %s", ' '.join(cmd))

            try:
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    check=True,
                    encoding="utf-8",
                    # The verbose LaTeX log may contain bytes that are not
                    # valid UTF-8; do not let decoding abort the conversion.
                    errors="replace",
                )
            except subprocess.CalledProcessError as e:
                logger.error(f"Pandoc conversion failed with return code {e.returncode}")
                logger.error(f"Error output:\n{e.stderr}")
                raise

            if result.stderr:
                logger.warning(f"Pandoc warnings: {result.stderr}")

            logger.info(f"Successfully created PDF: {output_file}")

        except Exception as e:
            logger.error(f"PDF conversion failed: {str(e)}")
            raise
        finally:
            # Clean up temporary header file
            if header_file is not None:
                try:
                    header_file.unlink()
                except FileNotFoundError:
                    pass

    def _cleanup(self):
        """Clean up temporary files and resources."""
        temp_path = self.output_dir / self.config.temp_filename
        try:
            temp_path.unlink()
        except FileNotFoundError:
            return  # nothing was written, nothing to remove
        logger.debug("Cleaned up temporary markdown file")


class ConverterGUI:
    """GUI interface for the markdown to PDF converter.

    A single window offers file pickers for the table-of-contents file, the
    input directory, the output PDF and an optional CSS file, a scrolling log
    pane, and a button that runs the conversion on a background thread.
    """

    def __init__(self):
        """Initialize the GUI."""
        self.root = tk.Tk()
        self.root.title("Markdown to PDF Converter")
        self.root.geometry("800x600")

        # Paths chosen through the Browse dialogs (None until selected)
        self.toc_path: Optional[Path] = None
        self.input_dir: Optional[Path] = None
        self.output_file: Optional[Path] = None
        self.css_file: Optional[Path] = None

        self._create_widgets()

    def _add_path_row(self, parent, label_text: str, browse_command) -> tk.StringVar:
        """Add one "label / entry / Browse" row and return the entry's variable."""
        row = ttk.Frame(parent)
        row.pack(fill=tk.X, pady=5)

        ttk.Label(row, text=label_text).pack(side=tk.LEFT)
        path_var = tk.StringVar()
        ttk.Entry(row, textvariable=path_var, width=50).pack(
            side=tk.LEFT, padx=5
        )
        ttk.Button(row, text="Browse", command=browse_command).pack(
            side=tk.LEFT
        )
        return path_var

    def _create_widgets(self):
        """Create and arrange GUI widgets."""
        # Main container
        main_frame = ttk.Frame(self.root, padding="10")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # File Selection Frame
        file_frame = ttk.LabelFrame(main_frame, text="File Selection", padding="10")
        file_frame.pack(fill=tk.X, pady=(0, 10))

        # One row per path; the StringVars only mirror the chosen paths.
        self.toc_var = self._add_path_row(
            file_frame, "Table of Contents:", self._select_toc
        )
        self.input_var = self._add_path_row(
            file_frame, "Input Directory:", self._select_input_dir
        )
        self.output_var = self._add_path_row(
            file_frame, "Output PDF:", self._select_output
        )
        self.css_var = self._add_path_row(
            file_frame, "Custom CSS (Optional):", self._select_css
        )

        # Log Display
        log_frame = ttk.LabelFrame(main_frame, text="Logs", padding="10")
        log_frame.pack(fill=tk.BOTH, expand=True, pady=(0, 10))

        self.log_display = scrolledtext.ScrolledText(log_frame, height=15)
        self.log_display.pack(fill=tk.BOTH, expand=True)

        # Convert Button - stays disabled until the inputs validate
        self.convert_button = ttk.Button(
            main_frame,
            text="Convert to PDF",
            command=self._start_conversion,
            state=tk.DISABLED,
        )
        self.convert_button.pack(pady=10)

    def _select_toc(self):
        """Handle table of contents file selection."""
        file_path = filedialog.askopenfilename(
            title="Select Table of Contents",
            filetypes=[("Markdown files", "*.md"), ("All files", "*.*")],
        )
        if not file_path:
            return  # dialog cancelled
        self.toc_path = Path(file_path)
        self.toc_var.set(str(self.toc_path))
        self._validate_inputs()
        self.update_log(f"Selected table of contents: {self.toc_path}")

    def _select_input_dir(self):
        """Handle input directory selection."""
        dir_path = filedialog.askdirectory(title="Select Input Directory")
        if not dir_path:
            return  # dialog cancelled
        self.input_dir = Path(dir_path)
        self.input_var.set(str(self.input_dir))
        self._validate_inputs()
        self.update_log(f"Selected input directory: {self.input_dir}")

    def _select_output(self):
        """Handle output PDF file selection."""
        file_path = filedialog.asksaveasfilename(
            title="Save PDF As",
            defaultextension=".pdf",
            filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")],
        )
        if not file_path:
            return  # dialog cancelled
        self.output_file = Path(file_path)
        self.output_var.set(str(self.output_file))
        self._validate_inputs()
        self.update_log(f"Selected output file: {self.output_file}")

    def _select_css(self):
        """Handle CSS file selection."""
        file_path = filedialog.askopenfilename(
            title="Select CSS File",
            filetypes=[("CSS files", "*.css"), ("All files", "*.*")],
        )
        if not file_path:
            return  # dialog cancelled
        self.css_file = Path(file_path)
        self.css_var.set(str(self.css_file))
        self.update_log(f"Selected CSS file: {self.css_file}")

    def _validate_inputs(self):
        """Validate inputs and enable/disable convert button."""
        valid = (
            self.toc_path is not None
            and self.toc_path.exists()
            and self.input_dir is not None
            # The converter rejects anything that is not a directory, so the
            # button applies the same test instead of a bare exists().
            and self.input_dir.is_dir()
            and self.output_file is not None
        )
        self.convert_button.config(state=tk.NORMAL if valid else tk.DISABLED)

    def _start_conversion(self):
        """Start the PDF conversion process."""
        self.convert_button.config(state=tk.DISABLED)
        self.update_log("Starting conversion...")

        def conversion_thread():
            try:
                converter = MarkdownToPDFConverter(
                    self.toc_path, self.input_dir, self.css_file
                )
                converter.convert_to_pdf(self.output_file)
                self.update_log("Conversion completed successfully!")

            except Exception as e:
                self.update_log(f"Error during conversion: {str(e)}")
                # Keep the traceback in the log file for diagnosis.
                logger.exception("Conversion error: %s", e)

            finally:
                # Re-enable the button through the Tk event queue.
                self.root.after(0, lambda: self.convert_button.config(state=tk.NORMAL))

        # Run conversion in separate thread so the window stays responsive
        thread = threading.Thread(target=conversion_thread)
        thread.daemon = True
        thread.start()

    def update_log(self, message: str):
        """Append a timestamped message to the log pane and the module logger."""
        timestamp = datetime.datetime.now().strftime("%H:%M:%S")
        formatted_message = f"[{timestamp}] {message}"

        def update():
            self.log_display.insert(tk.END, f"{formatted_message}\n")
            self.log_display.see(tk.END)

        # The widget update is scheduled on the Tk event loop rather than run
        # directly, because this method is also called from the worker thread.
        self.root.after(0, update)
        logger.info(message)

    def run(self):
        """Start the GUI main loop."""
        self.root.mainloop()


def main():
    """Main entry point: build the converter window and run its event loop.

    Any exception that escapes the GUI is logged with its traceback and then
    re-raised.
    """
    try:
        app = ConverterGUI()
        app.run()
    except Exception as e:
        logger.exception("Application error: %s", e)
        raise


if __name__ == "__main__":
    main()