MacOS_Parsers/Processing/0_02_слияние_всех эксель файлов ZARAHOME.py
2025-08-15 11:36:08 +03:00

131 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
import traceback
import pandas as pd
from datetime import datetime
# === Helpers ===
def get_script_dir() -> str:
    """Return the absolute directory that contains this script.

    When ``__file__`` is not defined (the lookup raises ``NameError``),
    the current working directory is returned instead.
    """
    try:
        script_path = __file__
    except NameError:
        return os.getcwd()
    return os.path.dirname(os.path.abspath(script_path))
def log(msg: str):
    """Print *msg* prefixed with a timestamp and append the same line to the log file.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] msg``. Writing to the
    module-level ``log_file`` path is best-effort: any exception raised while
    opening or writing the file is ignored, so the message is still printed
    and the caller is never interrupted by a logging failure.
    """
    stamped = f'[{datetime.now():%Y-%m-%d %H:%M:%S}] {msg}'
    print(stamped)
    try:
        with open(log_file, mode='a', encoding='utf-8') as handle:
            handle.write(stamped + '\n')
    except Exception:
        # Intentionally ignored: console output above is the primary channel.
        pass
def is_temp_or_hidden(name: str) -> bool:
    """Return True when *name* starts with ``~$`` (temporary file) or ``.`` (hidden file)."""
    skipped_prefixes = ('~$', '.')
    return name.startswith(skipped_prefixes)
# === Paths ===
# Prefix joined with each input file's stem to build the SourceFileUrl column.
BASE_URL = "https://www.zarahome.com/pl/en/"

# Input workbooks, the merged result and the log all live in 'Files-todo'
# next to this script.
script_dir = get_script_dir()
folder_path = os.path.join(script_dir, 'Files-todo')
log_file = os.path.join(folder_path, 'merge_log.txt')

# The output name carries a minute-resolution timestamp taken at start-up.
timestamp = f'{datetime.now():%Y%m%d-%H%M}'
output_filename = 'All-todo-' + timestamp + '.xlsx'
output_file = os.path.join(folder_path, output_filename)
def _list_input_files(folder):
    """Return the sorted full paths of the .xlsx files in *folder* that should be merged."""
    selected = []
    for name in sorted(os.listdir(folder)):
        if is_temp_or_hidden(name) or not name.lower().endswith('.xlsx'):
            continue
        # Merge results are written into this same folder with this prefix;
        # never feed a previous result back in as input.
        if name.startswith('All-todo-'):
            continue
        full = os.path.join(folder, name)
        if os.path.isfile(full):
            selected.append(full)
    return selected


def _order_columns(first_columns, combined_columns):
    """Return the output column order: first-file columns, sorted extras, then the Source* columns."""
    service = ('SourceFile', 'SourceFileUrl')
    extras = [
        c for c in combined_columns
        if c not in first_columns and c not in service
    ]
    # Column labels read from Excel are not always strings (numeric or date
    # headers). Sorting by text form keeps the order for all-string labels
    # and avoids a TypeError when label types are mixed.
    extras.sort(key=str)
    ordered = list(first_columns) + extras
    for col in service:
        if col not in ordered:
            ordered.append(col)
    return ordered


def main():
    """Merge every input .xlsx file in ``folder_path`` into one timestamped workbook.

    Steps:
      1. Collect the input files (temporary/hidden files, non-.xlsx files and
         earlier ``All-todo-*`` results are skipped).
      2. Read each file; empty files are skipped with a warning, unreadable
         files are logged with a traceback and counted as errors.
      3. Tag every row with ``SourceFile`` (file name) and ``SourceFileUrl``
         (``BASE_URL`` + file name without extension).
      4. Concatenate, order the columns and write the result to ``output_file``.

    Returns None. If there are no input files, or no file yields data, the
    function returns early without writing anything. A failure while saving
    is logged but not re-raised.

    Raises:
        SystemExit: with status 1 when ``folder_path`` is not a directory.
    """
    log('=== Старт объединения Excel-файлов ===')
    log(f'Папка: {folder_path}')
    if not os.path.isdir(folder_path):
        log(f'ОШИБКА: Папка не найдена: {folder_path}')
        sys.exit(1)

    files = _list_input_files(folder_path)
    if not files:
        log('Нет входных .xlsx файлов для слияния. Завершение.')
        return
    total = len(files)
    log(f'Найдено файлов: {total}')

    dfs = []
    first_columns = []  # column order of the first file that yielded data
    processed = 0
    errors = 0
    for i, path in enumerate(files, start=1):
        fname = os.path.basename(path)
        try:
            df = pd.read_excel(path, engine='openpyxl')
            if df is None or df.empty:
                log(f'ПРЕДУПРЕЖДЕНИЕ: Пустой файл — пропуск: {fname}')
                continue
            if not first_columns:
                # Captured before the Source* columns are added below.
                first_columns = list(dict.fromkeys(df.columns))
            # Record where each row came from.
            stem, _ = os.path.splitext(fname)
            df['SourceFile'] = fname
            df['SourceFileUrl'] = BASE_URL + stem
            dfs.append(df)
            processed += 1
            log(f'[{i}/{total}] OK: {fname} — строк: {len(df)}')
        except Exception as e:
            # One bad workbook must not abort the whole merge.
            errors += 1
            log(f'[{i}/{total}] ОШИБКА: {fname}: {e}')
            log(traceback.format_exc())

    if not dfs:
        log('Все файлы пустые или не прочитаны — нечего сохранять.')
        return

    combined = pd.concat(dfs, ignore_index=True)
    # Column order: columns of the first successfully read file, then the
    # remaining non-service columns sorted alphabetically, then the service
    # columns SourceFile and SourceFileUrl at the end.
    combined = combined.reindex(
        columns=_order_columns(first_columns, combined.columns)
    )

    try:
        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            combined.to_excel(writer, index=False, sheet_name='Sheet')
    except Exception as e:
        log(f'ОШИБКА сохранения {output_file}: {e}')
        log(traceback.format_exc())
    else:
        log(f'Готово: {output_file}')
        log(f'Итоговых строк: {len(combined)}')
        log(f'Успешно обработано файлов: {processed}, ошибок: {errors}')
    log('=== Готово ===')
# Run the merge only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()