#!/usr/bin/env python3
"""Rename .doc/.docx files in ./data to sequential T001-style names, backing up
the originals, setting duplicates aside, and saving extracted titles to JSON."""
import hashlib
import json
import os
import re
import shutil
import subprocess
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path


def extract_first_line_docx(docx_file):
    """Extract the first line of text from a docx file using built-in libraries."""
    try:
        # A .docx file is a zip archive; the main content lives in word/document.xml
        with zipfile.ZipFile(docx_file) as zip_ref:
            xml_content = zip_ref.read("word/document.xml")

        # Parse the XML and resolve the WordprocessingML namespace
        root = ET.fromstring(xml_content)
        namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }

        # Walk paragraphs in document order and return the first one that
        # contains text; this keeps the "first line" from swallowing text
        # that belongs to later paragraphs.
        for paragraph in root.findall(".//w:p", namespaces):
            paragraph_text = "".join(
                t.text for t in paragraph.findall(".//w:t", namespaces) if t.text
            )
            if paragraph_text.strip():
                return paragraph_text.strip()
        return "NoTextFound"
    except Exception as e:
        print(f"Error reading {docx_file}: {e}")
        return "ErrorReadingFile"


def extract_first_line_doc(doc_file):
    """Extract the first line of text from a doc file using antiword or catdoc
    if available, otherwise return a placeholder."""
    try:
        # Prefer antiword, then catdoc; shutil.which reports whether a tool
        # is on PATH without spawning a subprocess.
        available = [tool for tool in ("antiword", "catdoc") if shutil.which(tool)]
        if not available:
            print(
                "Neither antiword nor catdoc is installed. "
                "Cannot extract text from .doc files."
            )
            return f"Doc_FileNeedsConversion_{Path(doc_file).stem}"

        for tool in available:
            result = subprocess.run(
                [tool, str(doc_file)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            if result.returncode == 0:
                # Return the first non-empty line of the extracted text
                for line in result.stdout.split("\n"):
                    if line.strip():
                        return line.strip()
            else:
                print(f"{tool} error: {result.stderr}")
        return "NoTextFound"
    except Exception as e:
        print(f"Error reading {doc_file}: {e}")
        return "ErrorReadingFile"
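# For illustration (hypothetical extracted line): a first line such as
#   "15. Тема 3 Функции и графики copy"   ("Тема" is Russian for "Topic")
# is normalized by clean_title() below to "Функции и графики".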
def clean_title(title):
    """Clean a title by removing various prefixes and formatting."""
    # Remove leading numbers and dots (like "15 ", "3.", "8", etc.)
    title = re.sub(r"^\d+\.?\s*", "", title)
    # Remove the different variations of "Тема: N" or "Тема N"
    title = re.sub(r"^[Тт]ема[\s:]*\d+\s*", "", title)
    # Remove a trailing "copy"
    title = re.sub(r"\s+copy$", "", title)
    # Collapse runs of whitespace
    return re.sub(r"\s+", " ", title).strip()


def calculate_file_hash(filepath):
    """Calculate the MD5 hash of a file to compare content."""
    hasher = hashlib.md5()
    with open(filepath, "rb") as f:
        # Read in 64 KiB chunks so large files are never loaded whole
        buf = f.read(65536)
        while buf:
            hasher.update(buf)
            buf = f.read(65536)
    return hasher.hexdigest()


def find_similar_files(files):
    """Group similar files to detect originals and copies."""
    # First pass: group by filename, ignoring a trailing " copy"
    filename_groups = {}
    for file_path in files:
        base_name = re.sub(r"\s+copy\.[^.]+$", "", file_path.name)
        base_name = re.sub(r"\s+copy$", "", base_name)
        filename_groups.setdefault(base_name, []).append(file_path)

    # Keep only the groups that contain more than one file
    duplicate_groups = {k: v for k, v in filename_groups.items() if len(v) > 1}

    # Second pass: verify content similarity using hashes
    verified_duplicates = {}
    for base_name, file_paths in duplicate_groups.items():
        hashes = {}
        for file_path in file_paths:
            hashes.setdefault(calculate_file_hash(file_path), []).append(file_path)

        # Keep sub-groups whose content is byte-for-byte identical
        identical_groups = [group for group in hashes.values() if len(group) > 1]
        if identical_groups:
            verified_duplicates[base_name] = identical_groups

        # Also flag files with similar names but different content
        if len(hashes) > 1:
            print(
                f"Warning: Files with similar names but different content: {file_paths}"
            )

    return verified_duplicates, duplicate_groups
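# For illustration (hypothetical names): given ["report.docx",
# "report copy.docx", "notes.doc"], the first pass groups "report.docx" with
# "report copy.docx"; the pair is reported in verified_duplicates only when
# both files also hash to the same MD5 digest.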
def main():
    # Directory containing the doc/docx files
    data_dir = "./data"

    # Create backup and duplicates directories
    backup_dir = os.path.join(data_dir, "docs_backup")
    duplicates_dir = os.path.join(data_dir, "duplicates")
    os.makedirs(backup_dir, exist_ok=True)
    os.makedirs(duplicates_dir, exist_ok=True)

    # Get all doc and docx files in the data directory
    docs_path = Path(data_dir)
    doc_files = list(docs_path.glob("*.doc"))
    docx_files = list(docs_path.glob("*.docx"))
    all_files = doc_files + docx_files

    if not all_files:
        print(f"No doc/docx files found in {data_dir}")
        return
    print(f"Found {len(all_files)} documents")

    # First, handle duplicates
    print("\nChecking for duplicate files...")
    verified_duplicates, potential_duplicates = find_similar_files(all_files)

    # Move verified duplicates to the duplicates directory
    files_to_remove = []
    for base_name, identical_groups in verified_duplicates.items():
        for group in identical_groups:
            # Keep the first file, move the others to the duplicates directory
            keeper = group[0]
            for dupe in group[1:]:
                print(
                    f"Moving duplicate: {dupe.name} → duplicates/ "
                    f"(identical to {keeper.name})"
                )
                shutil.move(str(dupe), os.path.join(duplicates_dir, dupe.name))
                files_to_remove.append(dupe)

    # Handle potential duplicates with "copy" in the name
    for base_name, file_paths in potential_duplicates.items():
        copy_files = [f for f in file_paths if "copy" in f.name.lower()]
        non_copy_files = [f for f in file_paths if "copy" not in f.name.lower()]

        # Only act when both an original and a copy are present
        if copy_files and non_copy_files:
            for copy_file in copy_files:
                # Skip files already moved as verified duplicates
                if copy_file in files_to_remove:
                    continue
                print(
                    f"Moving potential duplicate: {copy_file.name} → duplicates/ "
                    "(has 'copy' in name)"
                )
                shutil.move(
                    str(copy_file), os.path.join(duplicates_dir, copy_file.name)
                )
                files_to_remove.append(copy_file)

    # Remove the moved duplicates from the processing list
    for file_path in files_to_remove:
        if file_path in all_files:
            all_files.remove(file_path)

    # Now process the remaining files
    print(f"\nProcessing {len(all_files)} unique files...")

    # Sort files by name so numbering is consistent across runs
    all_files.sort(key=lambda x: x.name)

    # Dictionary mapping each counter to the document's full title
    title_map = {}

    for counter, filepath in enumerate(all_files, 1):
        filename = filepath.name

        # Skip anything from the backup/duplicates directories
        if any(d in str(filepath) for d in ("docs_backup", "duplicates")):
            continue

        print(f"Processing {filename}...")

        # Get the first line of content based on file type
        if filepath.suffix.lower() == ".docx":
            first_line = extract_first_line_docx(filepath)
        else:  # .doc
            first_line = extract_first_line_doc(filepath)

        # Clean the title and record it under a zero-padded counter
        clean_full_title = clean_title(first_line)
        counter_str = f"{counter:03d}"
        title_map[counter_str] = clean_full_title

        # New filename in the requested "TXXX.extension" format
        new_filename = f"T{counter_str}{filepath.suffix}"
        new_filepath = filepath.parent / new_filename

        # Back up the original before renaming
        backup_path = Path(backup_dir) / filename
        shutil.copy2(filepath, backup_path)

        print(f"Renaming: {filename} → {new_filename}")
        filepath.rename(new_filepath)

    # Save the full titles to a JSON file
    json_path = os.path.join(data_dir, "document_titles.json")
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(title_map, json_file, ensure_ascii=False, indent=2)

    print(f"\nAll files renamed. Backups stored in '{backup_dir}' directory.")
    print(f"Duplicate files moved to '{duplicates_dir}' directory.")
    print(f"Full titles saved to '{json_path}'")

    # Print installation instructions if needed
    if doc_files:
        print(
            "\nNOTE: To process .doc files, you may need to install one of these tools:"
        )
        print("  - antiword: sudo apt-get install antiword")
        print("  - catdoc: sudo apt-get install catdoc")


if __name__ == "__main__":
    main()
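# Example of the resulting layout (hypothetical run over two unique files):
#   data/T001.docx, data/T002.doc        <- renamed documents
#   data/docs_backup/<original names>    <- untouched backups
#   data/duplicates/<moved copies>       <- files set aside as duplicates
#   data/document_titles.json            <- e.g. {"001": "Функции и графики"}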