#!/usr/bin/env python3
"""
Convert a directory of DOCX documents to Markdown format with proper formatting.

Three optional back-ends are probed at import time (mammoth, pypandoc and
python-docx + markdownify).  Whatever a back-end produces is passed through
``clean_markdown_text`` before being written next to the other results as
``<stem>.md``.
"""

import os
import re
import argparse
from pathlib import Path
from typing import Optional, List

# Check each library individually
MAMMOTH_AVAILABLE = False
PANDOC_AVAILABLE = False
DOCX_AVAILABLE = False

try:
    import mammoth

    MAMMOTH_AVAILABLE = True
    print("✓ mammoth library available")
except ImportError:
    print("✗ mammoth not available")

try:
    import pypandoc

    PANDOC_AVAILABLE = True
    print("✓ pypandoc library available")
except ImportError:
    print("✗ pypandoc not available")

try:
    from docx import Document
    from markdownify import markdownify as md

    DOCX_AVAILABLE = True
    print("✓ python-docx library available")
except ImportError:
    print("✗ python-docx not available")

# Show available methods
available_methods = []
if MAMMOTH_AVAILABLE:
    available_methods.append("mammoth")
if PANDOC_AVAILABLE:
    available_methods.append("pandoc")
if DOCX_AVAILABLE:
    available_methods.append("python-docx")

if available_methods:
    print(f"Available conversion methods: {', '.join(available_methods)}")
else:
    print("No conversion libraries available!")
    print("Install with: pip install mammoth pypandoc python-docx markdownify")


# --- Patterns used by clean_markdown_text ---------------------------------

# Lines whose stripped form starts with one of these open/close a fenced block.
_FENCE_MARKERS = ("```", "~~~")

# Spans that must never be rewritten.  Exactly ONE capturing group so that
# re.split() returns [text, protected, text, protected, ..., text].
_PROTECTED_SPAN_RE = re.compile(
    r"(`+[^`\n]+`+"  # inline code
    r"|\]\([^)\n]*\)"  # link / image destination
    r"|<[^<>\s]+>"  # autolink or simple inline HTML tag
    r"|\b[A-Za-z][A-Za-z0-9+.-]*://[^\s)>\]]+"  # bare URL
    r"|\bmailto:[^\s)>\]]+)"  # bare mailto link
)

# "-item" / "+item" / "*item": marker glued to a letter, "[" or a backtick.
# Deliberately does NOT match "**bold**", "---", "-5" or "--".
_TIGHT_BULLET_RE = re.compile(r"^([-*+])(?=[^\W\d_]|[\[`])")

# "1.Item" but not "3.14".
_TIGHT_ORDERED_RE = re.compile(r"^(\d+\.)(?=[^\s\d])")

# "** bold **" standing on its own (whitespace/punctuation on both sides).
_PADDED_BOLD_RE = re.compile(
    r"(?<![^\s(\[{])\*\*[ \t]*([^*\s](?:[^*\n]*[^*\s])?)[ \t]*\*\*"
    r"(?![^\s.,;:!?)\]}])"
)

_MULTI_SPACE_RE = re.compile(r" {2,}")
_BANG_QUESTION_RE = re.compile(r"([!?])(?=[A-Z])")
# A (possibly backslash-escaped) period that ends a lowercase word and is
# immediately followed by a capitalised word; leaves "U.S.A" and "file.TXT".
_SENTENCE_PERIOD_RE = re.compile(
    r"(?<=[a-z0-9)\]])(\\?\.)(?=[A-Z](?:[a-z\s']|$))"
)
_COMMA_RE = re.compile(r",(?=[a-zA-Z])")
# Group 1 swallows HTML entities so their ";" is never touched.
_COLON_SEMICOLON_RE = re.compile(r"(&#?\w+;)|(?<=\w)([;:])(?=[A-Za-z])")
# The look-behind keeps list/table/quote/heading markers separated from
# whatever follows them ("- , x" stays a list item).
_SPACE_BEFORE_SEPARATOR_RE = re.compile(r"(?<![-+|>#]) +([,;])")
_SPACE_BEFORE_TERMINATOR_RE = re.compile(
    r"(?<![-+|>#]) +([.:!?])(?=[\s.,;:!?)\]}]|$)"
)


def _space_after_colon(match: "re.Match") -> str:
    """Return an entity unchanged, or the colon/semicolon followed by a space."""
    if match.group(1) is not None:
        return match.group(1)
    return match.group(2) + " "


def _clean_text_segment(segment: str) -> str:
    """Apply spacing and punctuation fixes to a piece of plain prose."""
    # Bold first: removing the padding may leave double spaces to collapse.
    segment = _PADDED_BOLD_RE.sub(r"**\1**", segment)
    segment = _MULTI_SPACE_RE.sub(" ", segment)
    segment = _BANG_QUESTION_RE.sub(r"\1 ", segment)
    segment = _SENTENCE_PERIOD_RE.sub(r"\1 ", segment)
    segment = _COMMA_RE.sub(", ", segment)
    segment = _COLON_SEMICOLON_RE.sub(_space_after_colon, segment)
    segment = _SPACE_BEFORE_SEPARATOR_RE.sub(r"\1", segment)
    segment = _SPACE_BEFORE_TERMINATOR_RE.sub(r"\1", segment)
    return segment


def _clean_prose_line(line: str) -> str:
    """Clean one non-blank line that lies outside any fenced code block."""
    content = line.lstrip()
    # Keep the indentation verbatim: replacing a tab by one space would
    # change the nesting level of list items.
    indent = line[: len(line) - len(content)]

    bullet = _TIGHT_BULLET_RE.match(content)
    # A leading "*" with another "*" later on the line is emphasis
    # ("*italic* text"), not a list marker.
    if bullet and not (bullet.group(1) == "*" and "*" in content[1:]):
        content = "- " + content[1:]
    content = _TIGHT_ORDERED_RE.sub(r"\1 ", content)

    pieces = _PROTECTED_SPAN_RE.split(content)
    # Even indices are ordinary text, odd indices are protected spans.
    pieces[::2] = [_clean_text_segment(piece) for piece in pieces[::2]]
    return indent + "".join(pieces)


def clean_markdown_text(text: str) -> str:
    """
    Clean and normalise converter-generated Markdown.

    What is fixed (outside fenced code blocks only):
    - line endings are normalised to "\\n";
    - runs of spaces inside a line are collapsed (indentation is kept);
    - a missing space after a list marker is added ("-item", "1.Item");
    - runs of blank lines are reduced to a single blank line;
    - missing spaces after sentence punctuation, commas, colons and
      semicolons are added, stray spaces before punctuation are removed;
    - padding inside a bold span is removed ("** x **" -> "**x**").

    What is left alone on purpose:
    - everything between ``` or ~~~ fences;
    - inline code, link destinations, URLs, autolinks and HTML entities;
    - single "*" characters, because a lone asterisk is ambiguous (list
      bullet, multiplication sign, emphasis) and cannot be fixed safely
      with a regular expression.

    Args:
        text: Markdown text as produced by one of the converters.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    cleaned: List[str] = []
    open_fence: Optional[str] = None  # marker that opened the current fence

    for line in text.split("\n"):
        stripped = line.strip()
        marker = stripped[:3] if stripped.startswith(_FENCE_MARKERS) else None

        # A fence is only closed by the same marker that opened it.
        if marker is not None and open_fence in (None, marker):
            open_fence = marker if open_fence is None else None
            cleaned.append(line)
            continue

        if open_fence is not None:
            cleaned.append(line)  # code is copied verbatim
            continue

        if not stripped:
            # Collapse consecutive blank (or whitespace-only) lines.
            if not cleaned or cleaned[-1] != "":
                cleaned.append("")
            continue

        cleaned.append(_clean_prose_line(line))

    return "\n".join(cleaned).strip()


def convert_docx_with_mammoth(docx_path: Path) -> str:
    """
    Convert DOCX to Markdown using mammoth (recommended approach).

    Args:
        docx_path: Path to the DOCX file.

    Returns:
        The Markdown text reported by mammoth.

    Raises:
        Exception: Whatever opening the file or mammoth raises; the error is
            printed and then re-raised for the caller to handle.
    """
    try:
        with open(docx_path, "rb") as docx_file:
            result = mammoth.convert_to_markdown(docx_file)
        if result.messages:
            print(f"Mammoth warnings: {result.messages}")
        return result.value
    except Exception as e:
        print(f"Mammoth conversion error: {e}")
        raise


def convert_docx_with_pandoc(docx_path: Path) -> str:
    """Convert DOCX to Markdown using pandoc (alternative approach)."""
    return pypandoc.convert_file(str(docx_path), "md", format="docx")


def _escape_table_cell(text: str) -> str:
    """Make cell text safe for a single-line pipe-table cell."""
    # An unescaped "|" would start a new column and a raw newline would end
    # the row, so both have to be neutralised.
    return text.strip().replace("|", "\\|").replace("\n", "<br>")


def convert_docx_with_python_docx(docx_path: Path) -> str:
    """
    Convert DOCX to Markdown using python-docx (basic fallback).

    Paragraphs are emitted first, followed by all tables; the relative
    position of tables inside the document is not preserved.  Paragraphs
    whose style name starts with "Heading" become ATX headings.

    Args:
        docx_path: Path to the DOCX file.

    Returns:
        The generated Markdown text.
    """
    # str() keeps this working with python-docx releases that only accept a
    # string path or a file object.
    doc = Document(str(docx_path))
    content = []

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue

        style = paragraph.style
        style_name = (style.name if style is not None else "") or ""
        if style_name.startswith("Heading"):
            suffix = style_name.split()[-1]
            level = int(suffix) if suffix.isdigit() else 1
            # Markdown only knows heading levels 1-6.
            level = min(max(level, 1), 6)
            content.append("#" * level + " " + text)
        else:
            content.append(text)
        content.append("")  # Add blank line after each paragraph

    # Handle tables
    for table in doc.tables:
        content.append("")  # Add blank line before table
        for i, row in enumerate(table.rows):
            row_data = [_escape_table_cell(cell.text) for cell in row.cells]
            content.append("| " + " | ".join(row_data) + " |")
            if i == 0:  # Add separator after header
                content.append("|" + "---|" * len(row_data))
        content.append("")  # Add blank line after table

    return "\n".join(content)


def _select_converter(method: str):
    """Return the converter for *method*, a fallback, or None if none exists."""
    # Order matters: it is also the fallback preference.
    converters = (
        ("mammoth", MAMMOTH_AVAILABLE, convert_docx_with_mammoth),
        ("pandoc", PANDOC_AVAILABLE, convert_docx_with_pandoc),
        ("python-docx", DOCX_AVAILABLE, convert_docx_with_python_docx),
    )

    for name, is_available, converter in converters:
        if name == method and is_available:
            return converter

    print(f"Method '{method}' not available. Trying alternatives...")
    for _name, is_available, converter in converters:
        if is_available:
            return converter

    print("No conversion libraries available!")
    return None


def convert_single_docx(
    input_path: Path, output_dir: Path, method: str = "mammoth"
) -> bool:
    """
    Convert a single DOCX file to Markdown.

    The result is cleaned with ``clean_markdown_text`` and written to
    ``output_dir / "<input stem>.md"``; ``output_dir`` is created if needed.
    Errors are reported on stdout and never propagate, so that one broken
    document does not abort a batch run.

    Args:
        input_path: Path to the DOCX file
        output_dir: Directory to save the Markdown file
        method: Conversion method ('mammoth', 'pandoc', or 'python-docx');
            if it is unavailable the first available alternative is used

    Returns:
        True if successful, False otherwise
    """
    try:
        print(f"Converting: {input_path.name}")

        converter = _select_converter(method)
        if converter is None:
            return False

        cleaned_content = clean_markdown_text(converter(input_path))

        # The caller may hand in a directory that does not exist yet.
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{input_path.stem}.md"
        output_path.write_text(cleaned_content, encoding="utf-8")

        print(f"✓ Saved: {output_path}")
        return True

    except Exception as e:
        print(f"✗ Error converting {input_path.name}: {str(e)}")
        return False


def convert_directory(
    input_dir: str, output_dir: str, method: str = "mammoth", recursive: bool = False
):
    """
    Convert all DOCX files in a directory to Markdown.

    Files are matched on a case-insensitive ".docx" suffix, Word lock files
    ("~$...") are skipped, and files are processed in sorted order.  Output
    is flat (``output_dir/<stem>.md``); only when a second file would
    overwrite a result of the same run is it written into a sub-directory
    mirroring its location below ``input_dir``.

    Args:
        input_dir: Directory containing DOCX files
        output_dir: Directory to save Markdown files
        method: Conversion method to use
        recursive: Whether to search subdirectories
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Validate input directory
    if not input_path.exists():
        print(f"Error: Input directory '{input_dir}' does not exist.")
        return
    if not input_path.is_dir():
        print(f"Error: '{input_dir}' is not a directory.")
        return

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    candidates = input_path.rglob("*") if recursive else input_path.glob("*")
    docx_files = sorted(
        path
        for path in candidates
        if path.is_file()
        and path.suffix.lower() == ".docx"
        and not path.name.startswith("~$")  # temporary Word lock files
    )

    if not docx_files:
        print("No DOCX files found in the specified directory.")
        return

    print(f"Found {len(docx_files)} DOCX files to convert...")

    successful = 0
    failed = 0
    claimed_outputs = set()  # output paths already used in this run

    for docx_file in docx_files:
        target_dir = output_path
        if target_dir / f"{docx_file.stem}.md" in claimed_outputs:
            # Same stem as an earlier file (e.g. a/report.docx and
            # b/report.docx): mirror the source folder instead of silently
            # overwriting the earlier result.
            target_dir = output_path / docx_file.parent.relative_to(input_path)
        claimed_outputs.add(target_dir / f"{docx_file.stem}.md")

        if convert_single_docx(docx_file, target_dir, method):
            successful += 1
        else:
            failed += 1

    print("\nConversion complete:")
    print(f"✓ Successful: {successful}")
    print(f"✗ Failed: {failed}")


def main():
    """Parse the command line and run the directory conversion."""
    parser = argparse.ArgumentParser(
        description="Convert DOCX documents to Markdown format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python docx_to_md.py /path/to/docx/files /path/to/output
  python docx_to_md.py ./documents ./markdown --method pandoc --recursive
  python docx_to_md.py ~/docs ~/markdown -r
        """,
    )

    parser.add_argument("input_dir", help="Directory containing DOCX files")
    parser.add_argument("output_dir", help="Directory to save Markdown files")
    parser.add_argument(
        "--method",
        "-m",
        choices=["mammoth", "pandoc", "python-docx"],
        default="mammoth",
        help="Conversion method (default: mammoth)",
    )
    parser.add_argument(
        "--recursive",
        "-r",
        action="store_true",
        help="Search subdirectories recursively",
    )

    args = parser.parse_args()

    print("DOCX to Markdown Converter")
    print("=" * 30)
    print(f"Input directory: {args.input_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"Method: {args.method}")
    print(f"Recursive: {args.recursive}")
    print()

    convert_directory(args.input_dir, args.output_dir, args.method, args.recursive)


if __name__ == "__main__":
    main()