update
This commit is contained in:
333
data_structure/docx_to_md.py
Normal file
333
data_structure/docx_to_md.py
Normal file
@@ -0,0 +1,333 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert a directory of DOCX documents to Markdown format with proper formatting.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
|
||||
# Probe each optional conversion backend on its own, so that a missing
# package only disables the backend that actually depends on it.
MAMMOTH_AVAILABLE = False
PANDOC_AVAILABLE = False
DOCX_AVAILABLE = False

try:
    import mammoth

    MAMMOTH_AVAILABLE = True
    print("✓ mammoth library available")
except ImportError:
    print("✗ mammoth not available")

try:
    import pypandoc

    PANDOC_AVAILABLE = True
    print("✓ pypandoc library available")
except ImportError:
    print("✗ pypandoc not available")

try:
    from docx import Document

    DOCX_AVAILABLE = True
    print("✓ python-docx library available")
except ImportError:
    print("✗ python-docx not available")

# The python-docx backend only needs `Document`.  markdownify used to be
# imported in the same `try` as python-docx, so a missing markdownify wrongly
# reported the python-docx backend as unavailable.  Keep the `md` alias for
# backward compatibility, but treat the package as optional.
try:
    from markdownify import markdownify as md
except ImportError:
    md = None

# Show available methods
available_methods = [
    name
    for name, enabled in (
        ("mammoth", MAMMOTH_AVAILABLE),
        ("pandoc", PANDOC_AVAILABLE),
        ("python-docx", DOCX_AVAILABLE),
    )
    if enabled
]

if available_methods:
    print(f"Available conversion methods: {', '.join(available_methods)}")
else:
    print("No conversion libraries available!")
    print("Install with: pip install mammoth pypandoc python-docx markdownify")
|
||||
|
||||
|
||||
# Opening markers of fenced code blocks; text inside a fence is never edited.
_FENCE_MARKERS = ("```", "~~~")

# A list marker that is already followed by whitespace ("- x", "* x", "1. x").
_SPACED_MARKER = re.compile(r"([-+*]|\d+\.)[ \t]+")
# A bullet glued to its text ("-item").  Requiring a letter (or an opening
# bracket/backtick) keeps "---", "**bold**", "-5" and "+1" untouched.
_TIGHT_BULLET = re.compile(r"[-+*](?=[^\W\d_]|[\[(`])")
# An ordered marker glued to its text ("1.Item") but not a decimal ("3.14").
_TIGHT_ORDERED = re.compile(r"(\d+\.)(?=[^\s\d])")

# Spans whose characters must be preserved verbatim: inline code, link/image
# targets, HTML tags / autolinks, bare URLs and HTML entities.
_PROTECTED = re.compile(
    r"(`[^`]*`|\]\([^)]*\)|<[^>\s][^>]*>|https?://\S+|&#?\w+;)"
)

_BOLD = re.compile(r"\*\*([^*]+?)\*\*")
_ITALIC = re.compile(r"(?<![*\\])\*(?!\*)([^*]+?)(?<!\\)\*(?!\*)")

_MULTI_SPACE = re.compile(r" {2,}")
# "end.Next" -> "end. Next"; the lowercase look-behind spares "U.S.A".
_SENTENCE_GAP = re.compile(r"(?<=[a-z])([.!?])(?=[A-Z](?:[a-z]|\s))")
_COMMA_GAP = re.compile(r",(?=[a-zA-Z])")
_COLON_GAP = re.compile(r"([;:])(?=[a-zA-Z])")
# Spaces before punctuation that really ends a clause.  "!" followed by "["
# is an image marker, and punctuation followed by a letter (" .NET") is kept.
_SPACE_BEFORE_PUNCT = re.compile(
    r"(?<=\S) +(?=(?:[,.;:?]|!(?!\[))(?:[\s)\].!?]|$))"
)


def _emphasis_fixer(marker: str):
    """Return a ``re.sub`` callback that repairs one-sided emphasis spacing."""

    def fix(match):
        inner = match.group(1)
        core = inner.strip()
        starts_blank = inner[0].isspace()
        ends_blank = inner[-1].isspace()
        # Nothing to fix, or whitespace on both sides ("2 * 3 * 4"): the
        # latter is ambiguous with plain asterisks, so leave it alone.
        if not core or starts_blank == ends_blank:
            return match.group(0)
        # Move the stray space outside the markers instead of deleting it, so
        # neighbouring words are not glued to the emphasised text.
        lead = " " if starts_blank else ""
        trail = " " if ends_blank else ""
        return f"{lead}{marker}{core}{marker}{trail}"

    return fix


_FIX_BOLD = _emphasis_fixer("**")
_FIX_ITALIC = _emphasis_fixer("*")


def _clean_segment(segment: str) -> str:
    """Apply spacing and punctuation fixes to one unprotected text run."""
    segment = _BOLD.sub(_FIX_BOLD, segment)
    segment = _ITALIC.sub(_FIX_ITALIC, segment)
    segment = _MULTI_SPACE.sub(" ", segment)
    segment = _SENTENCE_GAP.sub(r"\1 ", segment)
    segment = _COMMA_GAP.sub(", ", segment)
    segment = _COLON_GAP.sub(r"\1 ", segment)
    return _SPACE_BEFORE_PUNCT.sub("", segment)


def _clean_line(line: str) -> str:
    """Clean one non-blank line that lies outside any fenced code block."""
    body = line.lstrip(" \t")
    indent = line[: len(line) - len(body)]

    # Split off a leading list marker so a "*" bullet is never mistaken for
    # an emphasis delimiter further down.
    marker = ""
    spaced = _SPACED_MARKER.match(body)
    if spaced:
        marker = spaced.group(1) + " "
        body = body[spaced.end():]
    elif _TIGHT_BULLET.match(body) and not (body[0] == "*" and "*" in body[1:]):
        # "*word*" at line start is emphasis, not a bullet missing its space.
        marker = "- "
        body = body[1:]
    else:
        tight = _TIGHT_ORDERED.match(body)
        if tight:
            marker = tight.group(1) + " "
            body = body[tight.end():]

    # re.split with one capture group alternates unprotected / protected
    # pieces; only the even (unprotected) positions are rewritten.
    pieces = _PROTECTED.split(body)
    pieces[::2] = [_clean_segment(piece) for piece in pieces[::2]]
    return indent + marker + "".join(pieces)


def clean_markdown_text(text: str) -> str:
    """
    Normalise whitespace, list markers and punctuation spacing in Markdown.

    The text is processed line by line. Lines inside fenced code blocks
    (``` or ~~~) are copied through unchanged; inline code spans, link
    targets, HTML tags, bare URLs and HTML entities are likewise preserved.

    Outside those protected regions the function:
    - converts CRLF / CR line endings to LF;
    - collapses runs of spaces inside a line (indentation is kept);
    - inserts the missing space after a glued list marker ("-item",
      "1.Item") without touching "---", "**bold**" or decimals like "3.14";
    - collapses consecutive blank lines to a single blank line;
    - inserts a missing space after sentence punctuation, commas, colons and
      semicolons, and removes spaces before clause-ending punctuation;
    - moves a stray space from just inside an emphasis marker to outside it
      ("*italic *text" -> "*italic* text") without gluing words together.

    Args:
        text: Markdown text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    cleaned_lines = []
    open_fence = None  # marker of the fence we are currently inside, if any

    for line in text.split("\n"):
        stripped = line.strip()

        if open_fence is not None:
            # Inside a code block: copy verbatim, watch for the closing fence.
            if stripped.startswith(open_fence):
                open_fence = None
            cleaned_lines.append(line)
        elif stripped.startswith(_FENCE_MARKERS):
            open_fence = stripped[:3]
            cleaned_lines.append(line)
        elif not stripped:
            # Keep at most one blank line between blocks of text.
            if cleaned_lines and cleaned_lines[-1] == "":
                continue
            cleaned_lines.append("")
        else:
            cleaned_lines.append(_clean_line(line))

    return "\n".join(cleaned_lines).strip()
|
||||
|
||||
|
||||
def convert_docx_with_mammoth(docx_path: Path) -> str:
    """Render the DOCX file at *docx_path* as Markdown through mammoth.

    Any messages reported by the converter are printed. If anything goes
    wrong the error is printed and then re-raised unchanged, so the caller
    decides how to handle it.
    """
    try:
        with open(docx_path, "rb") as stream:
            outcome = mammoth.convert_to_markdown(stream)
            notices = outcome.messages
            if notices:
                print(f"Mammoth warnings: {notices}")
            return outcome.value
    except Exception as error:
        print(f"Mammoth conversion error: {error}")
        raise
|
||||
|
||||
|
||||
def convert_docx_with_pandoc(docx_path: Path) -> str:
    """Render the DOCX file at *docx_path* as Markdown through pypandoc."""
    # pypandoc is handed the path as a plain string.
    source = str(docx_path)
    markdown = pypandoc.convert_file(source, "md", format="docx")
    return markdown
|
||||
|
||||
|
||||
def _escape_table_cell(text: str) -> str:
    """Make cell text safe for a single Markdown table row."""
    # A row must stay on one line, and a literal pipe would start a new cell.
    return " ".join(text.split()).replace("|", "\\|")


def _paragraph_to_markdown(paragraph) -> List[str]:
    """Return the Markdown lines for one paragraph (empty list if blank)."""
    text = paragraph.text.strip()
    if not text:
        return []

    style_name = paragraph.style.name or ""
    if style_name.startswith("Heading"):
        suffix = style_name.split()[-1]
        level = int(suffix) if suffix.isdigit() else 1
        # Markdown only defines heading levels 1-6; Word styles go up to 9.
        level = max(1, min(level, 6))
        text = "#" * level + " " + text

    return [text, ""]  # blank line after each paragraph


def _table_to_markdown(table) -> List[str]:
    """Return the Markdown lines for one table, padded with blank lines."""
    lines = [""]  # blank line before the table
    for index, row in enumerate(table.rows):
        cells = [_escape_table_cell(cell.text) for cell in row.cells]
        lines.append("| " + " | ".join(cells) + " |")
        if index == 0:  # the first row is treated as the header
            lines.append("|" + "---|" * len(cells))
    lines.append("")  # blank line after the table
    return lines


def convert_docx_with_python_docx(docx_path: Path) -> str:
    """Convert DOCX to Markdown using python-docx (basic fallback).

    Paragraphs whose style name starts with "Heading" become ATX headings
    (level taken from the trailing number, clamped to 1-6, default 1); other
    non-empty paragraphs are emitted as plain text. Tables become pipe
    tables with the first row as header; cell text is flattened to one line
    and literal ``|`` characters are escaped.

    When the installed python-docx provides ``Document.iter_inner_content``
    paragraphs and tables are emitted in document order; otherwise all
    paragraphs are emitted first, followed by all tables.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The generated Markdown text.
    """
    doc = Document(docx_path)

    # NOTE(review): iter_inner_content is only present in newer python-docx
    # releases, hence the getattr fallback - confirm the minimum version.
    iter_blocks = getattr(doc, "iter_inner_content", None)
    if callable(iter_blocks):
        blocks = iter_blocks()
    else:
        blocks = [*doc.paragraphs, *doc.tables]

    content: List[str] = []
    for block in blocks:
        # Tables expose `rows`; paragraphs do not.
        if hasattr(block, "rows"):
            content.extend(_table_to_markdown(block))
        else:
            content.extend(_paragraph_to_markdown(block))

    return "\n".join(content)
|
||||
|
||||
|
||||
def convert_single_docx(
    input_path: Path, output_dir: Path, method: str = "mammoth"
) -> bool:
    """
    Convert one DOCX file and write the cleaned Markdown next to its stem.

    The requested backend is used when its library is available. Otherwise
    a notice is printed and the first available backend in the order
    mammoth, pandoc, python-docx is used instead.

    Args:
        input_path: Path to the DOCX file
        output_dir: Directory to save the Markdown file
        method: Conversion method ('mammoth', 'pandoc', or 'python-docx')

    Returns:
        True if the Markdown file was written; False if no backend is
        available or any error occurred (the error is printed).
    """
    try:
        print(f"Converting: {input_path.name}")

        # Backends in fallback priority order: (name, usable?, converter).
        backends = (
            ("mammoth", MAMMOTH_AVAILABLE, convert_docx_with_mammoth),
            ("pandoc", PANDOC_AVAILABLE, convert_docx_with_pandoc),
            ("python-docx", DOCX_AVAILABLE, convert_docx_with_python_docx),
        )
        usable = [(name, convert) for name, ready, convert in backends if ready]

        converter = next(
            (convert for name, convert in usable if name == method), None
        )
        if converter is None:
            print(f"Method '{method}' not available. Trying alternatives...")
            if not usable:
                print("No conversion libraries available!")
                return False
            converter = usable[0][1]

        # Convert first, then tidy the result before it is written out.
        rendered = clean_markdown_text(converter(input_path))

        destination = output_dir / f"{input_path.stem}.md"
        with open(destination, "w", encoding="utf-8") as handle:
            handle.write(rendered)

        print(f"✓ Saved: {destination}")
        return True

    except Exception as exc:
        print(f"✗ Error converting {input_path.name}: {str(exc)}")
        return False
|
||||
|
||||
|
||||
def convert_directory(
    input_dir: str, output_dir: str, method: str = "mammoth", recursive: bool = False
):
    """
    Convert all DOCX files in a directory to Markdown.

    Files are matched by a case-insensitive ``.docx`` suffix and processed in
    sorted order; Word lock files (names starting with ``~$``) are skipped.
    In recursive mode the sub-directory layout of ``input_dir`` is mirrored
    under ``output_dir`` so that equally named documents from different
    folders do not overwrite each other. A summary of successes and failures
    is printed at the end.

    Args:
        input_dir: Directory containing DOCX files
        output_dir: Directory to save Markdown files (created if missing)
        method: Conversion method to use
        recursive: Whether to search subdirectories

    Returns:
        None. Problems are reported on stdout rather than raised.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # A regular file passes exists() but cannot be scanned for documents.
    if not input_path.is_dir():
        print(
            f"Error: Input directory '{input_dir}' does not exist "
            "or is not a directory."
        )
        return

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Match the suffix case-insensitively: glob("*.docx") misses "X.DOCX" on
    # case-sensitive filesystems.  Sorting makes the run order deterministic.
    candidates = input_path.rglob("*") if recursive else input_path.iterdir()
    docx_files = sorted(
        path
        for path in candidates
        if path.is_file()
        and path.suffix.lower() == ".docx"
        and not path.name.startswith("~$")  # temporary Word lock files
    )

    if not docx_files:
        print("No DOCX files found in the specified directory.")
        return

    print(f"Found {len(docx_files)} DOCX files to convert...")

    successful = 0
    failed = 0

    for docx_file in docx_files:
        # Mirror the source layout; for non-recursive runs this is simply
        # the output directory itself.
        target_dir = output_path / docx_file.parent.relative_to(input_path)
        target_dir.mkdir(parents=True, exist_ok=True)

        if convert_single_docx(docx_file, target_dir, method):
            successful += 1
        else:
            failed += 1

    print("\nConversion complete:")
    print(f"✓ Successful: {successful}")
    print(f"✗ Failed: {failed}")
|
||||
|
||||
|
||||
def main():
    """Parse the command line, echo the chosen settings and run the batch."""
    usage_examples = """
Examples:
python docx_to_md.py /path/to/docx/files /path/to/output
python docx_to_md.py ./documents ./markdown --method pandoc --recursive
python docx_to_md.py ~/docs ~/markdown -r
"""

    # RawDescriptionHelpFormatter keeps the epilog's line breaks as written.
    cli = argparse.ArgumentParser(
        description="Convert DOCX documents to Markdown format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument("input_dir", help="Directory containing DOCX files")
    cli.add_argument("output_dir", help="Directory to save Markdown files")
    cli.add_argument(
        "--method",
        "-m",
        choices=["mammoth", "pandoc", "python-docx"],
        default="mammoth",
        help="Conversion method (default: mammoth)",
    )
    cli.add_argument(
        "--recursive",
        "-r",
        action="store_true",
        help="Search subdirectories recursively",
    )
    options = cli.parse_args()

    # Echo the effective configuration before any work starts.
    banner = (
        "DOCX to Markdown Converter",
        "=" * 30,
        f"Input directory: {options.input_dir}",
        f"Output directory: {options.output_dir}",
        f"Method: {options.method}",
        f"Recursive: {options.recursive}",
        "",
    )
    for row in banner:
        print(row)

    convert_directory(
        options.input_dir, options.output_dir, options.method, options.recursive
    )


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user