import datetime
import logging
import queue
import re
import subprocess
import tempfile
import threading
import tkinter as tk
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from tkinter import ttk, filedialog, scrolledtext
from typing import Optional, List, Dict, Any
# Logging setup: records are sent both to "pdf_conversion.log" (created in
# the current working directory) and to the console stream.
_log_handlers = [logging.FileHandler("pdf_conversion.log"), logging.StreamHandler()]
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)

# Module-level logger shared by every class and function in this file.
logger = logging.getLogger(__name__)
@dataclass
class ConverterConfig:
    """Configuration settings for the PDF converter.

    Attributes:
        max_workers: Maximum number of worker threads used when reading the
            markdown files in parallel.
        supported_encodings: Encodings intended to be tried when reading
            files.
        temp_filename: Name of the temporary combined markdown file that is
            written into the converter's input directory.
        pandoc_options: Additional pandoc conversion options.

    NOTE(review): in the code visible in this file only ``max_workers`` and
    ``temp_filename`` are read by the converter; ``supported_encodings`` and
    ``pandoc_options`` are declared but not consulted. Confirm whether they
    are meant to be wired in.
    """

    max_workers: int = 4
    # Mutable defaults must be built per instance, hence default_factory.
    supported_encodings: List[str] = field(
        default_factory=lambda: [
            "utf-8", "utf-8-sig", "gbk", "gb2312", "gb18030", "latin1"
        ]
    )
    temp_filename: str = "combined_temp.md"
    pandoc_options: Dict[str, Any] = field(
        default_factory=lambda: {
            "toc_depth": 3,
            "margin": "1in",
            "highlight_style": "tango",
        }
    )
class MarkdownToPDFConverter:
    """Converts markdown files to a single PDF document using pandoc.

    The converter collects every ``chapter-*.md`` file in ``output_dir``,
    orders them by the numbers embedded in their names, concatenates them
    (dropping repeated headings) into one temporary markdown file and runs
    pandoc with the xelatex engine on it.

    The TOC file itself is not parsed: it is only checked for existence and
    its stem is used as the document title.
    """

    # An ATX heading is 1-6 '#' characters followed by whitespace or EOL.
    _ATX_HEADER_RE = re.compile(r"#{1,6}(?:\s|$)")
    # A setext underline is a run of only '=' or only '-'.
    _SETEXT_UNDERLINE_RE = re.compile(r"=+|-+")
    # Opening/closing marker of a fenced code block.
    _FENCE_RE = re.compile(r"`{3,}|~{3,}")

    # LaTeX preamble passed to pandoc via --include-in-header.
    _LATEX_HEADER = r"""
\usepackage{xcolor}
\usepackage{listings}
\usepackage{geometry}
\usepackage{fancyhdr}
\usepackage{titlesec}
% Page setup
\geometry{a4paper, margin=1in}
\pagestyle{fancy}
\fancyhf{}
\fancyhead[L]{\nouppercase{\leftmark}}
\fancyhead[R]{\thepage}
\renewcommand{\headrulewidth}{0.4pt}
% Chapter and section formatting - removed numbering
\titleformat{\chapter}
{\normalfont\huge\bfseries}{}{0pt}{\Huge}
\titlespacing*{\chapter}{0pt}{-20pt}{40pt}
% Remove section numbering
\setcounter{secnumdepth}{0}
% Code block styling
\definecolor{codebackground}{RGB}{248,248,248}
\definecolor{codecomment}{RGB}{106,153,85}
\definecolor{codekeyword}{RGB}{86,156,214}
\definecolor{codestring}{RGB}{206,145,120}
% Configure listings for copy-friendly code blocks
\lstset{
basicstyle=\ttfamily\small,
backgroundcolor=\color{codebackground},
breaklines=true,
breakatwhitespace=false,
numbers=none, % Removed line numbers
keywordstyle=\color{codekeyword},
commentstyle=\color{codecomment},
stringstyle=\color{codestring},
frame=single,
tabsize=4,
showstringspaces=false,
showspaces=false,
showtabs=false,
captionpos=b,
breakindent=0pt,
xleftmargin=0.5em, % Reduced left margin since we removed line numbers
xrightmargin=0.5em,
language=Python,
escapeinside={(*@}{@*)},
keepspaces=true,
columns=flexible,
basewidth=0.5em,
mathescape=true,
upquote=true, % Use straight quotes
literate={*}{{\char42}}1 % Fix asterisk rendering
}
% Fix for long code blocks
\lstset{
breaklines=true,
postbreak=\mbox{\textcolor{red}{$\hookrightarrow$}\space},
breakindent=0pt
}
"""

    def __init__(
        self, toc_file: Path, output_dir: Path, css_file: Optional[Path] = None,
        config: Optional["ConverterConfig"] = None
    ):
        """Initialize the converter.

        Args:
            toc_file: Path to table of contents markdown file
            output_dir: Directory containing markdown files
            css_file: Optional path to custom CSS file (stored, but not used
                by the pandoc/xelatex pipeline in this class)
            config: Optional configuration settings

        Raises:
            ValueError: If toc_file or output_dir are missing or don't exist
        """
        # Fail with the documented exception type instead of a TypeError
        # from Path(None) when a caller forgot to select a path.
        if toc_file is None or output_dir is None:
            raise ValueError("toc_file and output_dir are required")

        self.toc_file = Path(toc_file)
        self.output_dir = Path(output_dir)
        self.css_file = css_file
        self.config = config or ConverterConfig()
        # Filled in by _check_dependencies once a working binary is found.
        self.wkhtmltopdf_path: Optional[str] = None

        # Validate paths
        if not self.toc_file.exists():
            raise ValueError(f"TOC file not found: {self.toc_file}")
        if not self.output_dir.is_dir():
            raise ValueError(f"Invalid output directory: {self.output_dir}")

        # Thread-safe queue that worker threads push progress messages to
        self.log_queue = queue.Queue()

    def _check_dependencies(self) -> None:
        """Check if required dependencies are installed.

        Raises:
            RuntimeError: If the ``pandoc`` executable cannot be started.
            FileNotFoundError: If no ``wkhtmltopdf`` executable is found.
        """
        # Detect pandoc by the failing call itself rather than by searching
        # the OS error text, which does not contain the program name on
        # every platform.
        try:
            subprocess.run(["pandoc", "--version"], capture_output=True, text=True)
        except FileNotFoundError as exc:
            raise RuntimeError(
                "pandoc not found. Please install pandoc first."
            ) from exc
        logger.info("Found pandoc installation")

        # NOTE(review): the pandoc command built in _run_pandoc_conversion
        # uses xelatex, not wkhtmltopdf; this check is kept because callers
        # may rely on it, but it looks like a leftover - confirm.
        wkhtmltopdf_paths = [
            "wkhtmltopdf",  # If in PATH
            r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe",
            r"C:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe",
        ]
        for path in wkhtmltopdf_paths:
            try:
                subprocess.run([path, "--version"], capture_output=True, text=True)
            except FileNotFoundError:
                continue
            self.wkhtmltopdf_path = path  # Store the working path
            logger.info(f"Found wkhtmltopdf at: {path}")
            return

        raise FileNotFoundError(
            "wkhtmltopdf not found. Please install wkhtmltopdf from: "
            "https://wkhtmltopdf.org/downloads.html"
        )

    def _parse_toc(self) -> List[str]:
        """Return the chapter files of ``output_dir`` in reading order.

        Despite the name, the TOC file is not read; the order is derived
        from the numbers in the ``chapter-*.md`` filenames.

        Returns:
            List[str]: Ordered list of markdown filenames, e.g.::

                chapter-1.md
                chapter-1-1.md
                chapter-1-1-1.md
                chapter-1-2.md
                chapter-2.md
                chapter-2-1.md

            Files without a leading number sort last; files with the same
            numbers are ordered by filename so the result is deterministic.
        """
        prefix, suffix = "chapter-", ".md"

        def chapter_sort_key(filename: str) -> tuple:
            """Map 'chapter-1-2.md' to ((1, 2), 'chapter-1-2.md')."""
            # Strip only the leading prefix / trailing suffix, never
            # occurrences in the middle of the name.
            base = filename
            if base.startswith(prefix):
                base = base[len(prefix):]
            if base.endswith(suffix):
                base = base[:-len(suffix)]

            numbers = []
            for part in base.split('-'):
                match = re.match(r'(\d+)', part)
                if not match:
                    break
                numbers.append(int(match.group(1)))

            numeric_key = tuple(numbers) if numbers else (float('inf'),)
            return (numeric_key, filename)

        try:
            logger.info("Parsing table of contents...")
            all_md_files = [
                f.name for f in self.output_dir.glob("chapter-*.md") if f.is_file()
            ]
            logger.info(f"Found {len(all_md_files)} chapter files in directory")
            if not all_md_files:
                logger.warning("No chapter files found in directory")
                return []

            sorted_files = sorted(all_md_files, key=chapter_sort_key)
            logger.info("Sorted chapter files in correct order:")
            for f in sorted_files:
                logger.debug(f" {f}")
            return sorted_files
        except Exception as e:
            logger.error(f"Error parsing table of contents: {e}")
            raise

    def _read_file_with_fallback_encoding(self, file_path: Path) -> str:
        """Read file content, trying several encodings in turn.

        Args:
            file_path: File to read.

        Returns:
            The decoded text with ``\\r\\n`` normalized to ``\\n``.

        Raises:
            UnicodeError: If no encoding can decode the file.
        """
        # "utf-8-sig" goes first so a BOM is stripped rather than kept as a
        # character. NOTE(review): config.supported_encodings is not
        # consulted here - confirm whether that is intentional.
        encodings = ["utf-8-sig", "utf-8", "latin1"]
        errors = []
        for encoding in encodings:
            try:
                logger.debug(f"Trying to read {file_path} with {encoding} encoding")
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeError as e:
                errors.append(f"{encoding}: {str(e)}")
                continue
            return content.replace('\r\n', '\n')

        error_msg = (
            f"Failed to read {file_path} with any supported encoding:\n"
            + "\n".join(errors)
        )
        logger.error(error_msg)
        raise UnicodeError(error_msg)

    def _process_markdown_files(self, files: List[str]) -> List[str]:
        """Read the given markdown files in parallel.

        Args:
            files: List of markdown filenames (relative to ``output_dir``)

        Returns:
            List[str]: File contents, in the same order as ``files``

        Raises:
            RuntimeError: If any file could not be read
        """
        errors = []

        def process_file(filename: str) -> str:
            try:
                content = self._read_file_with_fallback_encoding(
                    self.output_dir / filename
                )
            except Exception as e:
                # Collect the failure so every bad file is reported at once.
                errors.append((filename, str(e)))
                return ""
            self.log_queue.put(f"Processed {filename}")
            return content

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            # executor.map preserves input order, which keeps chapters sorted.
            processed_content = list(executor.map(process_file, files))

        if errors:
            error_msg = "\n".join(f"{f}: {e}" for f, e in errors)
            raise RuntimeError(f"Failed to process files:\n{error_msg}")
        return processed_content

    def convert_to_pdf(self, output_file: Path) -> None:
        """Convert markdown files to PDF using pandoc.

        Args:
            output_file: Path where the output PDF will be saved

        Raises:
            RuntimeError: If pandoc is missing or a file cannot be read
            FileNotFoundError: If wkhtmltopdf is missing
            subprocess.CalledProcessError: If pandoc exits with an error
        """
        try:
            self._check_dependencies()
            files = self._parse_toc()
            # Read files in parallel
            contents = self._process_markdown_files(files)
            # Create temporary combined file
            temp_md = self.output_dir / self.config.temp_filename
            self._create_combined_markdown(temp_md, contents)
            # Run pandoc conversion
            self._run_pandoc_conversion(temp_md, output_file)
        except Exception as e:
            logger.error(f"PDF conversion failed: {e}")
            raise
        finally:
            # Remove the combined file whether or not conversion succeeded
            self._cleanup()

    @staticmethod
    def _indent_width(line: str) -> int:
        """Return the width of the leading whitespace (tabs count as 4)."""
        leading = line[: len(line) - len(line.lstrip())]
        return len(leading.expandtabs(4))

    @classmethod
    def _is_setext_underline(cls, raw_line: str) -> bool:
        """Tell whether ``raw_line`` is a '===' / '---' heading underline."""
        candidate = raw_line.strip()
        return (
            cls._indent_width(raw_line) <= 3
            and cls._SETEXT_UNDERLINE_RE.fullmatch(candidate) is not None
        )

    @classmethod
    def _dedupe_headers(cls, lines: List[str], seen_headers: set) -> List[str]:
        """Drop headings already present in ``seen_headers`` from ``lines``.

        Leading blank lines are skipped and trailing whitespace is removed
        from every line. Lines inside fenced code blocks and lines indented
        by four or more columns are copied verbatim, so ``# comment`` lines
        in code are never mistaken for headings. ``seen_headers`` is updated
        in place with every heading that is kept.
        """
        kept: List[str] = []
        open_fence = None  # marker that opened the current code block
        index, total = 0, len(lines)

        while index < total:
            line = lines[index].rstrip()
            stripped = line.strip()
            index += 1

            # Skip empty lines at the start of the section
            if not kept and not stripped:
                continue

            fence = cls._FENCE_RE.match(stripped)
            if open_fence is not None:
                kept.append(line)
                # A fence closes on a line made only of the same marker
                # character, at least as long as the opening marker.
                if (
                    fence
                    and stripped[0] == open_fence[0]
                    and len(stripped) >= len(open_fence)
                    and stripped == stripped[0] * len(stripped)
                ):
                    open_fence = None
                continue
            if fence:
                open_fence = fence.group(0)
                kept.append(line)
                continue

            # Four or more columns of indent mean an indented code block.
            if cls._indent_width(line) > 3:
                kept.append(line)
                continue

            # ATX style heading ("# Title")
            if cls._ATX_HEADER_RE.match(stripped):
                if stripped not in seen_headers:
                    seen_headers.add(stripped)
                    kept.append(line)
                continue

            # Setext style heading (text line followed by === or ---).
            # The text line must be non-empty and not itself a rule, so a
            # horizontal rule after a blank line is left untouched.
            if (
                stripped
                and index < total
                and cls._SETEXT_UNDERLINE_RE.fullmatch(stripped) is None
                and cls._is_setext_underline(lines[index])
            ):
                underline = lines[index].rstrip()
                index += 1  # consume the underline together with its text
                if stripped not in seen_headers:
                    seen_headers.add(stripped)
                    kept.append(line)
                    kept.append(underline)
                continue

            kept.append(line)

        return kept

    def _create_combined_markdown(self, temp_md: Path, contents: List[str]):
        """Write all chapter contents into one markdown file.

        The file starts with a YAML metadata block (title derived from the
        TOC file stem, today's date) followed by each chapter, separated by
        blank lines. A heading that already appeared earlier in the combined
        document is dropped; its body text is kept.

        Args:
            temp_md: Destination file (written as UTF-8 without BOM)
            contents: Chapter texts in reading order
        """
        title = self.toc_file.stem.replace("_", " ").title()
        # The metadata block must be contiguous: pandoc does not recognize
        # it when the opening '---' is followed by a blank line.
        front_matter = "\n".join([
            "---",
            f"title: {title}",
            f"date: {datetime.datetime.now().strftime('%Y-%m-%d')}",
            "---",
        ])
        combined_content = [front_matter]

        # Headings seen so far, shared across all chapters
        seen_headers = set()
        for content in contents:
            processed_lines = self._dedupe_headers(content.split('\n'), seen_headers)
            if processed_lines:
                combined_content.append('\n'.join(processed_lines))

        logger.debug(f"Writing combined content with {len(combined_content)} sections")
        temp_md.write_text('\n\n'.join(combined_content), encoding='utf-8')

    def _run_pandoc_conversion(self, temp_md: Path, output_file: Path):
        """Run pandoc on the combined markdown file.

        Args:
            temp_md: Combined markdown file to convert
            output_file: Destination PDF

        Raises:
            subprocess.CalledProcessError: If pandoc exits with a non-zero
                return code
        """
        title = self.toc_file.stem.replace("_", " ").title()

        # The LaTeX header lives in a private scratch directory so that a
        # "header.tex" belonging to the user is never overwritten or deleted.
        with tempfile.TemporaryDirectory() as scratch_dir:
            header_file = Path(scratch_dir) / "header.tex"
            header_file.write_text(self._LATEX_HEADER, encoding='utf-8')

            # NOTE: "--pdf-engine-opt=-shell-escape" is intentionally not
            # passed. Nothing in the header needs it, and it would let raw
            # TeX in the markdown input run shell commands.
            cmd = [
                "pandoc",
                str(temp_md),
                "--pdf-engine=xelatex",
                "--toc",
                "--toc-depth=3",
                "--top-level-division=chapter",
                "-V",
                "documentclass=report",
                "-V",
                f"title={title}",
                "--highlight-style=pygments",
                "-f",
                "markdown+smart+fenced_code_blocks+auto_identifiers",
                "--listings",
                f"--include-in-header={header_file}",
                "--wrap=none",
                "-V",
                "papersize=a4",
                "-V",
                "fontsize=11pt",
                "-V",
                "geometry:margin=1in",
                "-V",
                "linkcolor=blue",
                "--variable",
                "urlcolor=blue",
                "--variable",
                "toccolor=black",
                "-V",
                "colorlinks=true",
                "--verbose",
                "-o",
                str(output_file),
            ]
            logger.info(f"Running pandoc command: {' '.join(cmd)}")
            try:
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    check=True,
                    encoding="utf-8",
                    # Tool output may contain non-UTF-8 bytes; a decoding
                    # error must not turn a successful run into a failure.
                    errors="replace",
                )
            except subprocess.CalledProcessError as e:
                logger.error(f"Pandoc conversion failed with return code {e.returncode}")
                logger.error(f"Error output:\n{e.stderr}")
                raise

            if result.stderr:
                logger.warning(f"Pandoc warnings: {result.stderr}")
            logger.info(f"Successfully created PDF: {output_file}")

    def _cleanup(self):
        """Remove the temporary combined markdown file, if present."""
        temp_md = self.output_dir / self.config.temp_filename
        if temp_md.exists():
            temp_md.unlink()
            logger.debug("Cleaned up temporary markdown file")
class ConverterGUI:
    """GUI interface for the markdown to PDF converter.

    NOTE(review): ``_create_widgets`` and ``_validate_inputs`` as well as the
    attributes ``toc_var``, ``output_var``, ``css_var``, ``convert_button``
    and ``log_display`` are used below but are not defined in the visible
    source; presumably ``_create_widgets`` creates them - confirm.
    """

    def __init__(self):
        """Create the main window and initialize the selected paths."""
        self.root = tk.Tk()
        self.root.title("Markdown to PDF Converter")
        self.root.geometry("800x600")

        # Paths chosen by the user; None until a selection is made
        self.toc_path: Optional[Path] = None
        self.input_dir: Optional[Path] = None
        self.output_file: Optional[Path] = None
        self.css_file: Optional[Path] = None

        self._create_widgets()

    def _select_toc(self):
        """Handle table of contents file selection."""
        file_path = filedialog.askopenfilename(
            title="Select Table of Contents",
            filetypes=[("Markdown files", "*.md"), ("All files", "*.*")],
        )
        # An empty result means the dialog was cancelled
        if not file_path:
            return
        self.toc_path = Path(file_path)
        self.toc_var.set(str(self.toc_path))
        self._validate_inputs()
        self.update_log(f"Selected table of contents: {self.toc_path}")

    def _select_output(self):
        """Handle output PDF file selection."""
        file_path = filedialog.asksaveasfilename(
            title="Save PDF As",
            defaultextension=".pdf",
            filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")],
        )
        if not file_path:
            return
        self.output_file = Path(file_path)
        self.output_var.set(str(self.output_file))
        self._validate_inputs()
        self.update_log(f"Selected output file: {self.output_file}")

    def _select_css(self):
        """Handle CSS file selection."""
        file_path = filedialog.askopenfilename(
            title="Select CSS File",
            filetypes=[("CSS files", "*.css"), ("All files", "*.*")],
        )
        if not file_path:
            return
        self.css_file = Path(file_path)
        self.css_var.set(str(self.css_file))
        self.update_log(f"Selected CSS file: {self.css_file}")

    def _start_conversion(self):
        """Start the PDF conversion in a background thread.

        The convert button is disabled while the conversion runs and is
        re-enabled when it finishes, whether it succeeded or failed.
        """
        # Snapshot the selections so the worker is unaffected by later
        # changes made in the GUI.
        toc_path = self.toc_path
        input_dir = self.input_dir
        output_file = self.output_file
        css_file = self.css_file

        # Refuse to start with missing selections: a None output file would
        # otherwise be passed to pandoc as the literal file name "None".
        if toc_path is None or input_dir is None or output_file is None:
            self.update_log(
                "Cannot start conversion: select the table of contents, "
                "the input directory and the output file first."
            )
            return

        self.convert_button.config(state=tk.DISABLED)
        self.update_log("Starting conversion...")

        def conversion_thread():
            try:
                converter = MarkdownToPDFConverter(toc_path, input_dir, css_file)
                converter.convert_to_pdf(output_file)
                self.update_log("Conversion completed successfully!")
            except Exception as e:
                self.update_log(f"Error during conversion: {str(e)}")
                logger.error(f"Conversion error: {e}")
            finally:
                # Widget changes are scheduled onto the Tk event loop
                self.root.after(0, lambda: self.convert_button.config(state=tk.NORMAL))

        # Daemon thread: it does not keep the process alive on window close
        thread = threading.Thread(target=conversion_thread)
        thread.daemon = True
        thread.start()

    def update_log(self, message: str):
        """Append a timestamped message to the log display and the logger.

        Args:
            message: Text to show; it is prefixed with the current time
                (HH:MM:SS) in the log display.
        """
        timestamp = datetime.datetime.now().strftime("%H:%M:%S")
        formatted_message = f"[{timestamp}] {message}"

        def update():
            self.log_display.insert(tk.END, f"{formatted_message}\n")
            self.log_display.see(tk.END)  # keep the newest entry visible

        # Scheduled via the event loop because this method is also called
        # from the conversion thread.
        self.root.after(0, update)
        logger.info(message)

    def run(self):
        """Start the GUI main loop."""
        self.root.mainloop()
def main():
    """Main entry point: build the GUI and run its event loop.

    Raises:
        Exception: Any error raised while creating or running the GUI is
            logged with its traceback and then re-raised.
    """
    try:
        app = ConverterGUI()
        app.run()
    except Exception as e:
        # logger.exception records the traceback along with the message
        logger.exception(f"Application error: {e}")
        raise


# Run the application only when executed as a script, not on import.
if __name__ == "__main__":
    main()