update
This commit is contained in:
141
data_structure/analyze.py
Normal file
141
data_structure/analyze.py
Normal file
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import glob
|
||||
import re
|
||||
import zipfile
|
||||
import subprocess
|
||||
import xml.etree.ElementTree as ET
|
||||
import shutil
|
||||
import json
|
||||
from pathlib import Path
|
||||
import difflib
|
||||
import hashlib
|
||||
import docx
|
||||
from docx2pdf import convert
|
||||
|
||||
|
||||
def clean_string(input_string):
    """Normalize a raw heading line into a title suitable for comparison.

    The steps, in order:

    1. Drop everything up to and including the first run of digits, plus an
       optional period and any whitespace right after it.
    2. Normalize spacing around ``.``, ``,`` and ``:`` (one space after,
       none before).
    3. Collapse repeated ``.``, ``:`` or ``,`` into a single character.
    4. Remove a final period, including one followed by trailing whitespace.
    5. Collapse runs of whitespace into a single space.
    6. Remove a leading ``тема:`` label (optionally preceded by a period).
    7. Strip leading and trailing whitespace.

    Args:
        input_string: The raw text line.

    Returns:
        The cleaned title string (possibly empty).
    """
    # Remove the numbering prefix: the lazy ``.*?`` stops at the FIRST digit
    # run, so text that precedes the number is discarded together with it.
    result = re.sub(r"^.*?(\d+)\.?\s*", "", input_string)

    # Exactly one space after punctuation that is followed by more text.
    result = re.sub(r"([.,:])\s*([^\s.,:])", r"\1 \2", result)

    # No whitespace in front of punctuation.
    result = re.sub(r"\s+([.,:])", r"\1", result)

    # Collapse repeated punctuation marks into a single one.
    result = re.sub(r"\.{2,}", ".", result)
    result = re.sub(r":{2,}", ":", result)
    result = re.sub(r",{2,}", ",", result)

    # Remove the ending dot. Trailing whitespace is consumed as well, because
    # the final strip() happens later and a plain ``\.$`` would miss a dot
    # that is followed by spaces.
    result = re.sub(r"\.\s*$", "", result)

    # Replace multiple whitespace characters with a single space.
    result = re.sub(r"\s{2,}", " ", result)

    # Drop a leading "тема:" label.
    # NOTE(review): the match is case-sensitive, so a capitalized "Тема:" is
    # kept as part of the title - confirm that this is intended.
    result = re.sub(r"^\.?\s*тема:\s*", "", result)

    return result.strip()
|
||||
|
||||
|
||||
def _looks_like_title(para):
    """Return True when any run of *para* is bold or larger than 12 pt."""
    for run in para.runs:
        size = run.font.size
        # python-docx reports font sizes as Length values (integer EMU), so
        # the comparison has to be made in points; comparing the raw value
        # with 12 would be true for every explicitly sized run.
        if run.bold or (size is not None and size.pt > 12):
            return True
    return False


def read_first_line_docx(filename):
    """Return the cleaned title line of a .docx document.

    Selection rules:

    * If the very first paragraph contains text, it is used.
    * Otherwise, among the next paragraphs (up to the fifth overall), the
      first non-empty one with title-like formatting (bold or > 12 pt) wins.
    * Failing that, the first non-empty paragraph of the document is used.

    Only the first line of the chosen paragraph is kept, and it is passed
    through ``clean_string``.

    Args:
        filename: Path of the document to open.

    Returns:
        The cleaned title, ``"Document contains no text"`` when no paragraph
        has text, or ``"Error reading document: <message>"`` when anything
        goes wrong while reading (errors are reported, never raised).
    """
    try:
        # Read the paragraph list once and reuse it below.
        paragraphs = docx.Document(filename).paragraphs

        if not paragraphs:
            return "Document contains no text"

        # Look for a title among the first five paragraphs.
        for index, para in enumerate(paragraphs[:5]):
            text = para.text.strip()
            if not text:
                continue
            if index == 0 or _looks_like_title(para):
                return clean_string(text.split("\n")[0].strip())

        # Fallback: first paragraph with any text. Strip before splitting so
        # a paragraph that starts with a line break does not yield an empty
        # first line.
        for para in paragraphs:
            text = para.text.strip()
            if text:
                return clean_string(text.split("\n")[0].strip())

        return "Document contains no text"

    except Exception as e:
        # Deliberate best-effort: unreadable files are reported as a title
        # string so that one bad file does not abort the whole run.
        return f"Error reading document: {str(e)}"
|
||||
|
||||
|
||||
def group_duplicates(items):
    """Bucket items that share the same ``"title"`` value.

    Args:
        items: Iterable of mappings, each with a ``"title"`` key.

    Returns:
        A list of groups (lists of the original items). Groups appear in the
        order their title was first seen, and items keep their input order
        inside each group.
    """
    buckets = {}
    for entry in items:
        # setdefault creates the bucket on first sight of a title.
        buckets.setdefault(entry["title"], []).append(entry)
    return [*buckets.values()]
|
||||
|
||||
|
||||
def _pdf_name(filename):
    """Return *filename* with a trailing ``.docx`` replaced by ``.pdf``.

    Only the extension is touched; a "docx" that appears elsewhere in the
    name is left alone, and names without a ``.docx`` suffix are returned
    unchanged.
    """
    suffix = ".docx"
    if filename.endswith(suffix):
        return filename[: -len(suffix)] + ".pdf"
    return filename


def main():
    """Build ``structure.json`` from the documents found in ``./data``.

    Every entry of ``./data`` is read with ``read_first_line_docx``; entries
    with an identical title are grouped with ``group_duplicates``. Each group
    becomes one record with a sequential id (``T001``, ``T002``, ...), the
    shared title and the list of file names with ``.docx`` swapped for
    ``.pdf``. The records are written to ``structure.json`` in the current
    working directory as UTF-8 JSON.
    """
    data_dir = "./data"

    # os.listdir() returns names in arbitrary order; sorting keeps the
    # generated ids stable from one run to the next.
    entries = [
        {
            "filename": filename,
            "title": read_first_line_docx(os.path.join(data_dir, filename)),
        }
        for filename in sorted(os.listdir(data_dir))
    ]

    final_structure = [
        {
            "id": f"T{i:03d}",
            "title": group[0]["title"],
            "files": [_pdf_name(entry["filename"]) for entry in group],
        }
        for i, group in enumerate(group_duplicates(entries), 1)
    ]

    with open("structure.json", "w", encoding="utf-8") as json_file:
        # ensure_ascii=False keeps non-ASCII titles readable in the output.
        json.dump(final_structure, json_file, ensure_ascii=False, indent=4)
|
||||
|
||||
|
||||
def rename_files():
    """Rename every ``./data/*.docx`` file to ``f001.docx``, ``f002.docx``, ...

    Files are numbered in sorted order of their current paths. The rename is
    done in two phases through temporary staging names, so a file that
    already carries one of the target names (for example when the function
    is run a second time) is never overwritten by an earlier rename.

    Raises:
        FileExistsError: If one of the staging names already exists; nothing
            has been renamed at that point.
        OSError: If an underlying rename fails.
    """
    # Sort the files to ensure consistent numbering.
    sources = sorted(glob.glob("./data/*.docx"))
    targets = [f"./data/f{i:03d}.docx" for i in range(1, len(sources) + 1)]
    # Staging names do not match "*.docx", so they cannot clash with sources.
    staged = [f"{target}.renaming" for target in targets]

    # Check up front so that a clash is detected before any file is moved.
    for path in staged:
        if os.path.exists(path):
            raise FileExistsError(f"Staging path already exists: {path}")

    # Phase 1: move every source out of the way, freeing all target names.
    for source, temp in zip(sources, staged):
        os.rename(source, temp)

    # Phase 2: give each file its final three-digit name (001, 002, etc.).
    for source, temp, target in zip(sources, staged, targets):
        os.rename(temp, target)
        print(f"Renamed: {source} -> {target}")
|
||||
|
||||
|
||||
# Script entry point: build structure.json only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user