142 lines
4.1 KiB
Python
142 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
import os
|
|
import glob
|
|
import re
|
|
import zipfile
|
|
import subprocess
|
|
import xml.etree.ElementTree as ET
|
|
import shutil
|
|
import json
|
|
from pathlib import Path
|
|
import difflib
|
|
import hashlib
|
|
import docx
|
|
from docx2pdf import convert
|
|
|
|
|
|
def clean_string(input_string):
    """Normalize a raw heading line into a clean title string.

    The transformations are applied in this order:

    1. Drop everything up to and including the first run of digits, an
       optional period right after it, and any whitespace that follows.
       Strings without digits are left untouched by this step.
    2. Put exactly one space after ``.``, ``,`` or ``:`` when it is
       followed by a non-punctuation character, and remove whitespace
       in front of those marks.
    3. Collapse runs of ``.``, ``:`` or ``,`` into a single mark.
    4. Remove a trailing period, including when whitespace follows it.
    5. Collapse whitespace runs into a single space.
    6. Remove a leading ``тема:`` prefix (optionally preceded by a period).
    7. Strip leading and trailing whitespace.

    Args:
        input_string: The raw text to clean.

    Returns:
        The cleaned string.
    """
    # Remove everything up to and including the first numbers and any period
    # that follows, then any whitespace.
    result = re.sub(r"^.*?(\d+)\.?\s*", "", input_string)

    # Fix spacing around punctuation (periods, colons, commas):
    # one space after the mark when text follows it...
    result = re.sub(r"([.,:])\s*([^\s.,:])", r"\1 \2", result)
    # ...and no space before the mark.
    result = re.sub(r"\s+([.,:])", r"\1", result)

    # Replace multiple consecutive punctuation marks with a single one.
    result = re.sub(r"\.{2,}", ".", result)
    result = re.sub(r":{2,}", ":", result)
    result = re.sub(r",{2,}", ",", result)

    # Remove the ending dot. Trailing whitespace is consumed together with it;
    # otherwise "Title. " would keep its dot, because stripping happens last.
    result = re.sub(r"\.\s*$", "", result)

    # Replace multiple whitespaces with a single space.
    result = re.sub(r"\s{2,}", " ", result)

    # NOTE(review): this match is case-sensitive, so a capitalized prefix is
    # not removed -- confirm that is intended.
    result = re.sub(r"^\.?\s*тема:\s*", "", result)

    # Trim leading and trailing whitespace.
    return result.strip()
|
|
|
|
|
|
def read_first_line_docx(filename):
    """Return the cleaned title line of a .docx document.

    The first five paragraphs are scanned. A non-empty paragraph is used
    when it is the very first paragraph of the document (index 0) or when
    at least one of its runs looks like a title: explicitly bold, or with
    an explicit font size above 12 pt. If none of them qualifies, the
    first non-empty paragraph anywhere in the document is used instead.
    Only the first line of the chosen paragraph is kept, and it is passed
    through ``clean_string``.

    Args:
        filename: Path of the .docx file to open.

    Returns:
        The cleaned title, ``"Document contains no text"`` when no paragraph
        has text, or ``"Error reading document: <message>"`` when anything
        raises while reading. Errors are reported through the return value,
        not raised.
    """
    try:
        doc = docx.Document(filename)

        # Check if document has any content.
        if not doc.paragraphs:
            return "Document contains no text"

        # Look for a title among the first 5 paragraphs.
        for i, para in enumerate(doc.paragraphs[:5]):
            text = para.text.strip()
            if not text:
                continue

            # python-docx reports font sizes as Length values measured in
            # EMU (12 pt == 152400), so the size has to be read in points;
            # comparing the raw value with 12 is true for every explicit
            # size. ``bold``/``size`` are None when inherited from the style.
            is_title = any(
                run.bold
                or (run.font.size is not None and run.font.size.pt > 12)
                for run in para.runs
            )

            # Accept a title-like paragraph, or the document's first
            # paragraph when it has text.
            if is_title or i == 0:
                return clean_string(text.split("\n")[0].strip())

        # Fallback: the first paragraph that has any text. Strip before
        # splitting so a leading line break does not yield an empty title.
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                return clean_string(text.split("\n")[0].strip())

        return "Document contains no text"

    except Exception as e:
        # Best-effort by design: callers get a message instead of a crash.
        return f"Error reading document: {str(e)}"
|
|
|
|
|
|
def group_duplicates(items):
    """Group items that share the same ``"title"`` value.

    Args:
        items: Iterable of mappings, each with a hashable ``"title"`` entry.

    Returns:
        A list of groups (lists of the original item objects). Groups are
        ordered by the first appearance of their title, and items keep
        their input order inside each group.

    Raises:
        KeyError: If an item has no ``"title"`` key.
    """
    grouped = {}
    for item in items:
        # setdefault creates the bucket on first sight of a title.
        grouped.setdefault(item["title"], []).append(item)
    return list(grouped.values())
|
|
|
|
|
|
def main():
    """Index the .docx files in ``./data`` by title and write ``structure.json``.

    Every ``.docx`` file in ``./data`` is read with ``read_first_line_docx``;
    files with the same title are grouped with ``group_duplicates``. Each
    group becomes one JSON object with:

    * ``id``    -- ``T001``, ``T002``, ... in group order,
    * ``title`` -- the shared title,
    * ``files`` -- the group's file names with the extension changed
      to ``.pdf``.

    The result is written to ``structure.json`` in the current working
    directory as UTF-8 with non-ASCII characters kept as-is.
    """
    data_dir = "./data"

    text = []
    # os.listdir returns entries in arbitrary order; sort so the generated
    # ids are reproducible between runs.
    for filename in sorted(os.listdir(data_dir)):
        # Only Word documents are indexed; anything else would only yield
        # an "Error reading document" pseudo-title.
        if not filename.lower().endswith(".docx"):
            continue
        filepath = os.path.join(data_dir, filename)
        first_line = read_first_line_docx(filepath)
        text.append({"filename": filename, "title": first_line})

    final_structure = []
    for i, g in enumerate(group_duplicates(text), 1):
        final_structure.append(
            {
                "id": f"T{i:03d}",
                "title": g[0]["title"],
                # Swap only the extension; a plain str.replace would also
                # rewrite "docx" occurring elsewhere in the name.
                "files": [
                    os.path.splitext(t["filename"])[0] + ".pdf" for t in g
                ],
            }
        )

    with open("structure.json", "w", encoding="utf-8") as json_file:
        json.dump(final_structure, json_file, ensure_ascii=False, indent=4)
|
|
|
|
|
|
def rename_files():
    """Rename every ``./data/*.docx`` file to ``f001.docx``, ``f002.docx``, ...

    Files are numbered in sorted path order. The rename happens in two
    phases -- first to temporary names, then to the final names -- so that
    a file which already carries one of the target names (for example an
    existing ``f001.docx``) is never overwritten by an earlier rename.

    Prints one ``Renamed: <old> -> <new>`` line per file.

    Raises:
        OSError: If any underlying ``os.rename`` call fails.
    """
    # Sort the files to ensure consistent numbering.
    docx_files = sorted(glob.glob("./data/*.docx"))

    # Phase 1: move every file out of the way. The temporary names do not
    # end in ".docx", so they cannot clash with any source file.
    staged = []
    for i, file_path in enumerate(docx_files, 1):
        # Final filename with 3 digits (001, 002, etc.).
        new_filename = f"./data/f{i:03d}.docx"
        temp_filename = f"{new_filename}.tmp-{os.getpid()}"
        os.rename(file_path, temp_filename)
        staged.append((file_path, temp_filename, new_filename))

    # Phase 2: all sources are gone, so the final names are free.
    for file_path, temp_filename, new_filename in staged:
        os.rename(temp_filename, new_filename)
        print(f"Renamed: {file_path} -> {new_filename}")
|
|
|
|
|
|
# Script entry point: run main() only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|