# MacOS_Parsers/Processing/0_02_слияние_всех эксель файлов ZARAHOME.py

import os
import sys
import traceback
import pandas as pd
from datetime import datetime

# === Вспомогательное ===
def get_script_dir() -> str:
    """Return the directory that contains this script.

    Falls back to the current working directory when ``__file__`` is not
    defined (the lookup raises ``NameError`` in that case).
    """
    try:
        script_path = os.path.abspath(__file__)
    except NameError:
        return os.getcwd()
    return os.path.dirname(script_path)

def log(msg: str):
    """Print *msg* with a timestamp prefix and append the same line to ``log_file``.

    Writing to the log file is best-effort: any exception raised while
    opening or writing it is ignored, so the message still reaches stdout.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    entry = f'[{stamp}] {msg}'
    print(entry)
    try:
        with open(log_file, 'a', encoding='utf-8') as handle:
            handle.write(f'{entry}\n')
    except Exception:
        # Deliberately swallowed: a failed log write must not stop the run.
        pass

def is_temp_or_hidden(name: str) -> bool:
    """Return True when *name* starts with ``'~$'`` or with ``'.'``."""
    skipped_prefixes = ('~$', '.')
    return name.startswith(skipped_prefixes)

# === Paths ===
# Input workbooks, the merged workbook and the log file all live in the
# 'Files-todo' folder next to this script.
script_dir = get_script_dir()
folder_path = os.path.join(script_dir, 'Files-todo')

# Minute-resolution stamp, taken once when the module is loaded.
timestamp = datetime.now().strftime('%Y%m%d-%H%M')
output_filename = 'All-todo-' + timestamp + '.xlsx'
output_file = os.path.join(folder_path, output_filename)

log_file = os.path.join(folder_path, 'merge_log.txt')

# Prefix for the SourceFileUrl column; main() appends the input file's stem.
BASE_URL = "https://www.zarahome.com/pl/en/"

def main():
    """Merge the Excel workbooks found in ``folder_path`` into one output file.

    Steps:
      1. Collect every ``.xlsx`` file in ``folder_path`` (sorted by name),
         skipping temp/hidden files and earlier ``All-todo-*`` outputs.
      2. Read each file with pandas. Empty files are skipped with a warning;
         files that fail to read are logged with a traceback and counted as
         errors, and processing continues with the next file.
      3. Tag every row with ``SourceFile`` (the file name) and
         ``SourceFileUrl`` (``BASE_URL`` + the file name without extension).
      4. Concatenate all frames, order the columns and write the result to
         ``output_file``.

    Column order of the result: columns of the first successfully read,
    non-empty file; then any other columns in sorted order; then the
    ``SourceFile`` / ``SourceFileUrl`` service columns (unless the first file
    already contained them).

    Exits the process with status 1 when ``folder_path`` does not exist.
    Returns ``None`` otherwise, including when there is nothing to merge or
    when saving fails (a save failure is only logged).
    """
    log('=== Старт объединения Excel-файлов ===')
    log(f'Папка: {folder_path}')

    if not os.path.isdir(folder_path):
        log(f'ОШИБКА: Папка не найдена: {folder_path}')
        sys.exit(1)

    # Sorted listing gives a reproducible processing order.
    files = []
    for name in sorted(os.listdir(folder_path)):
        if is_temp_or_hidden(name):
            continue
        if not name.lower().endswith('.xlsx'):
            continue
        # Results of previous runs live in the same folder; never re-merge them.
        if name.startswith('All-todo-'):
            continue
        full = os.path.join(folder_path, name)
        if os.path.isfile(full):
            files.append(full)

    if not files:
        log('Нет входных .xlsx файлов для слияния. Завершение.')
        return

    total = len(files)
    log(f'Найдено файлов: {total}')

    dfs = []
    base_columns = []  # column order taken from the first usable file
    processed = 0
    errors = 0

    for i, path in enumerate(files, start=1):
        fname = os.path.basename(path)
        try:
            df = pd.read_excel(path, engine='openpyxl')
            if df is None or df.empty:
                log(f'ПРЕДУПРЕЖДЕНИЕ: Пустой файл — пропуск: {fname}')
                continue

            if not base_columns:
                # Captured before the service columns are added below;
                # dict.fromkeys drops duplicates while keeping first-seen order.
                base_columns = list(dict.fromkeys(df.columns))

            # Record where each row came from.
            stem, _ = os.path.splitext(fname)
            df['SourceFile'] = fname
            df['SourceFileUrl'] = BASE_URL + stem

            dfs.append(df)
            processed += 1
            log(f'[{i}/{total}] OK: {fname} — строк: {len(df)}')
        except Exception as e:
            # One bad file must not abort the whole merge.
            errors += 1
            log(f'[{i}/{total}] ОШИБКА: {fname}: {e}')
            log(traceback.format_exc())

    if not dfs:
        log('Все файлы пустые или не прочитаны — нечего сохранять.')
        return

    combined = pd.concat(dfs, ignore_index=True)

    # Column order: first the columns of the first usable file, then the
    # remaining non-service columns in sorted order, then the service columns
    # SourceFile and SourceFileUrl at the end.
    service_cols = ('SourceFile', 'SourceFileUrl')
    extra_cols = [
        c for c in combined.columns
        if c not in base_columns and c not in service_cols
    ]
    try:
        extra_cols = sorted(extra_cols)
    except TypeError:
        # Column labels of different types (e.g. an int header next to a str
        # header) are not mutually orderable; fall back to ordering by their
        # string form instead of crashing after all files were read.
        extra_cols = sorted(extra_cols, key=str)

    final_cols = base_columns + extra_cols
    for col in service_cols:
        if col not in final_cols:
            final_cols.append(col)

    combined = combined.reindex(columns=final_cols)

    try:
        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            combined.to_excel(writer, index=False, sheet_name='Sheet')
        log(f'Готово: {output_file}')
        log(f'Итоговых строк: {len(combined)}')
        log(f'Успешно обработано файлов: {processed}, ошибок: {errors}')
    except Exception as e:
        log(f'ОШИБКА сохранения {output_file}: {e}')
        log(traceback.format_exc())

    log('=== Готово ===')

if __name__ == '__main__':
    main()