"""Convert DOCX files to HTML with mammoth and tidy punctuation spacing."""

import os
import pathlib
import re

import mammoth
from bs4 import BeautifulSoup
from bs4.element import PreformattedString

# Either a separator sitting between two digits (matched so it can be left
# untouched: "3,14", "1.000", "12:30", "12.05.2023"), or a punctuation mark
# glued directly to the following word character (captured in group 1).
_SPACE_AFTER_RE = re.compile(r"(?<=\d)[.,:](?=\d)|([;:,.])(?=\w)")

# Whitespace immediately before a punctuation mark or a closing parenthesis.
_SPACE_BEFORE_RE = re.compile(r"\s+([;:,.)])")

# NOTE(review): the literal HTML tags in the two patterns below were missing
# from the reviewed copy of this file (they appear to have been stripped by
# an HTML-to-text step). They are reconstructed from the surrounding
# comments ("a <p> tag containing only a <strong> tag") -- please confirm.
#
# The tempered ``(?:(?!</strong>|</p>).)*`` keeps the match inside a single
# paragraph and a single <strong>; a plain lazy ``.*?`` could run on to a
# later ``</strong></p>`` and delete several paragraphs at once.
_STRONG_ONLY_PARAGRAPH_RE = re.compile(
    r"<p>\s*<strong>(?:(?!</strong>|</p>).)*</strong>\s*</p>"
)
_EMPTY_PARAGRAPH_RE = re.compile(r"<p>\s*</p>")

# Text inside these elements is left exactly as written.
_UNFORMATTED_TAGS = ["pre", "code", "script", "style"]


def _space_after(match):
    """Return the replacement for one ``_SPACE_AFTER_RE`` match."""
    mark = match.group(1)
    if mark is None:
        # Numeric separator between two digits: keep it unchanged.
        return match.group(0)
    return mark + " "


def format_text(text: str) -> str:
    """
    Format text with proper spacing around punctuation.

    Rules:
    - A space is inserted after ``;``, ``:``, ``,`` or ``.`` when the mark is
      immediately followed by a word character.
    - Whitespace directly before ``;``, ``:``, ``,``, ``.`` or ``)`` is
      removed.
    - ``.``, ``,`` and ``:`` between two digits are left alone, so decimal
      numbers, thousands separators, times and dotted dates are preserved.

    Abbreviations, URLs and similar tokens are not recognised: a period
    followed by a letter always gets a space after it.
    """
    text = _SPACE_AFTER_RE.sub(_space_after, text)
    return _SPACE_BEFORE_RE.sub(r"\1", text)


def _format_html_text_nodes(html: str) -> str:
    """Apply ``format_text`` to the text nodes of *html* and re-serialise it."""
    soup = BeautifulSoup(html, "html.parser")

    # find_all() returns a list, so replacing nodes while looping is safe.
    for text_node in soup.find_all(string=True):
        # Comments, doctypes, CDATA sections and the like are string nodes
        # too; replacing them with a plain string would turn them into
        # visible text, so they are left untouched.
        if isinstance(text_node, PreformattedString):
            continue
        # Skip text anywhere inside an unformatted element, not only when
        # that element is the direct parent.
        if text_node.find_parent(_UNFORMATTED_TAGS) is not None:
            continue
        new_text = format_text(str(text_node))
        if new_text != text_node:
            text_node.replace_with(new_text)

    return str(soup)


def convert_docx_to_html(
    docx_path, output_path, apply_formatting=True, remove_first_strong=True
):
    """
    Convert one DOCX file to an HTML file.

    Args:
        docx_path: Path of the DOCX file to read.
        output_path: Path of the HTML file to write (UTF-8).
        apply_formatting: When true, fix punctuation spacing in the text
            nodes of the generated HTML.
        remove_first_strong: When true, drop the first paragraph that
            consists solely of a ``<strong>`` element.

    Returns:
        ``output_path``, unchanged.
    """
    with open(docx_path, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
    html = result.value

    if remove_first_strong:
        # Remove only the first <p> that contains nothing but a <strong>.
        html = _STRONG_ONLY_PARAGRAPH_RE.sub("", html, count=1)
        # Clean up any empty paragraph tags that might be left.
        # NOTE(review): whether this clean-up originally ran only when
        # remove_first_strong is set could not be recovered from the
        # flattened source; the default path behaves the same either way.
        html = _EMPTY_PARAGRAPH_RE.sub("", html)

    if apply_formatting:
        html = _format_html_text_nodes(html)

    with open(output_path, "w", encoding="utf-8") as html_file:
        html_file.write(html)

    return output_path


def convert_all_docx_files(
    input_dir, output_dir, apply_formatting=True, remove_first_strong=True
):
    """
    Convert every DOCX file found directly in *input_dir* to HTML.

    The output directory is created if needed. Each ``name.docx`` becomes
    ``name.html`` in *output_dir*. Sub-directories are not searched.
    """
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Match the extension case-insensitively and skip Word's "~$name.docx"
    # lock files, which would otherwise be handed to the converter and abort
    # the whole batch. Sorting keeps the processing order stable between runs.
    docx_files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith(".docx") and not name.startswith("~$")
    )

    for docx_file in docx_files:
        docx_path = os.path.join(input_dir, docx_file)
        html_file = os.path.splitext(docx_file)[0] + ".html"
        html_path = os.path.join(output_dir, html_file)

        print(f"Converting {docx_path} to {html_path}")
        convert_docx_to_html(
            docx_path, html_path, apply_formatting, remove_first_strong
        )

    print(f"Converted {len(docx_files)} files from {input_dir} to {output_dir}")


def post_process_html_files(directory):
    """
    Process already converted HTML files to fix punctuation spacing.

    This is useful for fixing existing files without re-converting from
    DOCX. Every ``.html`` file directly inside *directory* is rewritten in
    place.
    """
    html_files = [f for f in os.listdir(directory) if f.endswith(".html")]

    for html_file in html_files:
        file_path = os.path.join(directory, html_file)
        print(f"Post-processing {file_path}")

        with open(file_path, "r", encoding="utf-8") as file:
            html = file.read()

        processed = _format_html_text_nodes(html)

        with open(file_path, "w", encoding="utf-8") as file:
            file.write(processed)

    print(f"Post-processed {len(html_files)} HTML files in {directory}")


# Run the conversion
if __name__ == "__main__":
    input_directory = "./data"
    output_directory = "./data_html"

    # Option 1: Convert DOCX to HTML
    convert_all_docx_files(
        input_directory,
        output_directory,
        apply_formatting=True,
        remove_first_strong=True,
    )

    # Option 2: Post-process existing HTML files
    # Uncomment the line below to use this option
    # post_process_html_files(output_directory)