#!/usr/bin/env python3
"""Index the .docx files in ./data by their cleaned-up title line.

Running the script reads a title from every .docx file in ./data, groups
files that share a title, and writes the groups to structure.json in the
current working directory.
"""
import os
import glob
import re
import zipfile
import subprocess
import xml.etree.ElementTree as ET
import shutil
import json
from pathlib import Path
import difflib
import hashlib

import docx
from docx2pdf import convert


def clean_string(input_string):
    """Normalize a raw title line into a comparable title string.

    Steps, in order: drop everything up to and including the first run of
    digits (plus an optional following period and whitespace), normalize
    spacing around '.', ',' and ':', collapse repeated punctuation, drop a
    trailing period, collapse whitespace, strip a leading "тема:" label and
    trim the result.

    Args:
        input_string: The raw text of the title line.

    Returns:
        The cleaned title. A string without any digits keeps its prefix,
        because the first substitution only matches when digits are present.
    """
    # Remove everything up to and including the first number and any period
    # that follows, then any whitespace.
    result = re.sub(r"^.*?(\d+)\.?\s*", "", input_string)

    # Ensure exactly one space after punctuation that is followed by text.
    # NOTE: this also splits decimals ("3.14" -> "3. 14").
    result = re.sub(r"([.,:])\s*([^\s.,:])", r"\1 \2", result)

    # Remove whitespace before punctuation.
    result = re.sub(r"\s+([.,:])", r"\1", result)

    # Collapse runs of the same punctuation mark into a single one.
    result = re.sub(r"\.{2,}", ".", result)
    result = re.sub(r":{2,}", ":", result)
    result = re.sub(r",{2,}", ",", result)

    # Remove a trailing period.
    result = re.sub(r"\.$", "", result)

    # Collapse multiple whitespace characters into a single space.
    result = re.sub(r"\s{2,}", " ", result)

    # Strip a leading (lowercase) "тема:" label, optionally preceded by a dot.
    result = re.sub(r"^\.?\s*тема:\s*", "", result)

    return result.strip()


def read_first_line_docx(filename):
    """Return the cleaned title line of a .docx document.

    Among the first five paragraphs, the first non-empty paragraph that is
    either the very first paragraph or looks like a title (a bold run, or a
    run with an explicit font size above 12 pt) is used. If none qualifies,
    the first non-empty paragraph of the whole document is used.

    Args:
        filename: Path of the .docx file to read.

    Returns:
        The title passed through clean_string(), "Document contains no text"
        when no paragraph has text, or "Error reading document: <reason>"
        when the file cannot be read. Errors are reported in the return
        value rather than raised.
    """
    try:
        doc = docx.Document(filename)

        if not doc.paragraphs:
            return "Document contains no text"

        for i, para in enumerate(doc.paragraphs[:5]):
            text = para.text.strip()
            if not text:
                continue

            # font.size is a Length measured in EMU (or None when the size is
            # inherited), so it must be converted to points before being
            # compared with a point value.
            is_title = any(
                run.bold
                or (run.font.size is not None and run.font.size.pt > 12)
                for run in para.runs
            )

            if is_title or i == 0:
                return clean_string(text.split("\n")[0].strip())

        # No title-like paragraph found: fall back to the first paragraph
        # that has any text at all.
        for para in doc.paragraphs:
            if para.text.strip():
                return clean_string(para.text.split("\n")[0].strip())

        return "Document contains no text"

    except Exception as e:
        # Deliberate best effort: one unreadable file must not abort the run.
        return f"Error reading document: {str(e)}"


def group_duplicates(items):
    """Group items that share the same "title" value.

    Args:
        items: Iterable of dicts, each with a "title" key.

    Returns:
        A list of groups (lists of items). Groups appear in the order their
        title was first seen, and items keep their input order in a group.
    """
    grouped = {}
    for item in items:
        grouped.setdefault(item["title"], []).append(item)
    return list(grouped.values())


def main():
    """Write structure.json describing the titles of the files in ./data.

    Each entry has an id ("T001", "T002", ...), the shared title and the
    list of file names with the extension replaced by ".pdf".
    """
    data_dir = "./data"

    # Only real .docx documents are indexed ("~$" files are Word lock files).
    # Sorting makes the generated ids reproducible between runs, since
    # os.listdir() returns entries in arbitrary order.
    filenames = sorted(
        name
        for name in os.listdir(data_dir)
        if name.lower().endswith(".docx") and not name.startswith("~$")
    )

    text = [
        {
            "filename": filename,
            "title": read_first_line_docx(os.path.join(data_dir, filename)),
        }
        for filename in filenames
    ]

    final_structure = [
        {
            "id": f"T{i:03d}",
            "title": group[0]["title"],
            # Swap only the extension; str.replace("docx", "pdf") would also
            # rewrite "docx" occurring elsewhere in the file name.
            "files": [
                Path(t["filename"]).with_suffix(".pdf").name for t in group
            ],
        }
        for i, group in enumerate(group_duplicates(text), 1)
    ]

    with open("structure.json", "w", encoding="utf-8") as json_file:
        json.dump(final_structure, json_file, ensure_ascii=False, indent=4)


def rename_files():
    """Rename ./data/*.docx to f001.docx, f002.docx, ... in sorted order.

    The rename is done in two phases through temporary names so that a file
    that already carries one of the target names (for example after an
    earlier run) is never overwritten before it has been renamed itself.

    Raises:
        FileExistsError: If a temporary name is already taken; nothing has
            been renamed at that point.
    """
    # Sort the files to ensure consistent numbering.
    docx_files = sorted(glob.glob("./data/*.docx"))

    # Check every temporary name up front so a conflict aborts the operation
    # before any file has been touched.
    staged = [(file_path, file_path + ".renaming") for file_path in docx_files]
    for _, temp_path in staged:
        if os.path.exists(temp_path):
            raise FileExistsError(
                f"Temporary name already in use: {temp_path}"
            )

    # Phase 1: move every file out of the *.docx namespace.
    for file_path, temp_path in staged:
        os.rename(file_path, temp_path)

    # Phase 2: assign the final numbered names (001, 002, ...).
    for i, (file_path, temp_path) in enumerate(staged, 1):
        new_filename = f"./data/f{i:03d}.docx"
        os.rename(temp_path, new_filename)
        print(f"Renamed: {file_path} -> {new_filename}")


if __name__ == "__main__":
    main()