From b3c1ee2b6900527322f54942fd723b1cf8f74bca Mon Sep 17 00:00:00 2001 From: va1is Date: Sun, 24 Aug 2025 15:11:06 +0300 Subject: [PATCH] NEXT --- .gitignore | 5 +- Parser_NEXT/.env.example | 2 + Parser_NEXT/README.md | 17 + Parser_NEXT/categories.xlsx | Bin 0 -> 8840 bytes Parser_NEXT/config.yaml | 53 ++ Parser_NEXT/fetcher.py | 636 ++++++++++++++++ Parser_NEXT/main.py | 193 +++++ Parser_NEXT/models.py | 23 + Parser_NEXT/parser.py | 115 +++ Parser_NEXT/requirements.txt | 8 + Parser_NEXT/sink.py | 108 +++ .../tests/fixtures/category_sample.html | 9 + Parser_NEXT/tests/test_parser.py | 9 + Parser_NEXT/utils.py | 19 + Parsing ZARAHOME/src/categories.xlsx | Bin 17012 -> 16275 bytes ...х эксель файлов из папки в один одинаковый формат.py | 2 +- python3.13 | 0 Парсер_IKEA/dictionary_main.txt | 12 + Парсер_IKEA/exclusion_materials.txt | 1 + Парсер_IKEA/ikea_products_flat.xlsx | Bin 11517 -> 19410903 bytes Парсер_IKEA/leaf_categories copy.txt | 121 +++ Парсер_IKEA/leaf_categories.txt | 6 +- Парсер_IKEA/leaf_categories.xlsx | Bin 47586 -> 39378 bytes Парсер_IKEA/links.txt | 58 +- Парсер_IKEA/log_all_CatProd.txt | 27 +- Парсер_IKEA/main.py | 699 +++++++++++++++--- Парсер_IKEA/product_links.txt | 29 +- Парсер_IKEA/product_links.xlsx | Bin 7473 -> 8647 bytes Парсер_IKEA/result.xlsx | Bin 13855 -> 29567 bytes Парсер_IKEA/~$leaf_categories.xlsx | Bin 0 -> 165 bytes 30 files changed, 1987 insertions(+), 165 deletions(-) create mode 100644 Parser_NEXT/.env.example create mode 100644 Parser_NEXT/README.md create mode 100644 Parser_NEXT/categories.xlsx create mode 100644 Parser_NEXT/config.yaml create mode 100644 Parser_NEXT/fetcher.py create mode 100644 Parser_NEXT/main.py create mode 100644 Parser_NEXT/models.py create mode 100644 Parser_NEXT/parser.py create mode 100644 Parser_NEXT/requirements.txt create mode 100644 Parser_NEXT/sink.py create mode 100644 Parser_NEXT/tests/fixtures/category_sample.html create mode 100644 Parser_NEXT/tests/test_parser.py create mode 100644 Parser_NEXT/utils.py create mode 100644 python3.13 create mode 100644 Парсер_IKEA/dictionary_main.txt create mode 100644 Парсер_IKEA/exclusion_materials.txt create mode 100644 Парсер_IKEA/leaf_categories copy.txt create mode 100644 Парсер_IKEA/~$leaf_categories.xlsx diff --git a/.gitignore b/.gitignore index 3d0e2a0..f9adf74 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ Temporary Items .apdisk __pycache__ -/Parsing ZARAHOME/src/records_folder +records_folder Ignore_Temp -/Processing/Files-todo \ No newline at end of file +/Processing/Files-todo +out \ No newline at end of file diff --git a/Parser_NEXT/.env.example b/Parser_NEXT/.env.example new file mode 100644 index 0000000..7e68a3f --- /dev/null +++ b/Parser_NEXT/.env.example @@ -0,0 +1,2 @@ +# PROXY=http://user:pass@host:port +# RATE_LIMIT=1.0 diff --git a/Parser_NEXT/README.md b/Parser_NEXT/README.md new file mode 100644 index 0000000..3c73b01 --- /dev/null +++ b/Parser_NEXT/README.md @@ -0,0 +1,17 @@ +# NEXT.pl Parser (Playwright, Python 3.12) + +## Quick start +```bash +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -r requirements.txt +python -m playwright install chromium +python main.py +``` + +**categories.xlsx** — формат ввода: +- Первая колонка (A) — ссылки на категории (без заголовка). +- Любые другие колонки (B, C, …) игнорируются (можно писать пометки). +- Пустые строки и ячейки не учитываются. + +Outputs land in **records_folder/** as XLSX (+CSV/JSONL). Configure selectors/scroll in **config.yaml**. diff --git a/Parser_NEXT/categories.xlsx b/Parser_NEXT/categories.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..fbf50da18b913bd6a6a756bc38675561d761fcb7 GIT binary patch literal 8840 zcmeHNhgVbC_6`Dq6bZctLXj>lK#(FJAU#Nvj`Uun_a*|;oAlmAP(ct-q=gbdf^;w- zAiekAei`4qH#5%s{(|>z*1Bh{%b%EX2COo)5TzzW#s5fAI`dru3=7`G9iA z3KtI+c$B7Uq;Q2cL%T^hA4#@$BzIMq8|2#BUgy3z2g($X+lf?@3{3e6?DpH#I5~a@ zjqLiQK@c5+Xw^2R72)md-e7H^q)u|x*W1n~qm|;PHZh5{$^xW6t%qwsWap%nDrs!U zDI|mEbG~)!5YKcq5>)7yiy}~MvuZluiJ`r(hJo{gq$75=9Dc_8^E`@J(M*0*6rRQ< z>R??>4l|8L*RwR`W;gZ*!O<_uE~ywS3mBz$^d&> z!7Kq0o3n^jF~5zkiw4iOc2ie_1g>rZ0N2-;0QJAfvQC?i`2fu|6}0N`(XupgwRCXf zz5Cnwe{%dU*5IEWy)sD^2InIP-&VMYKpc-wCje!Xy`&XBfwcmI6{m>mVvAU4r(4+Q zfLb)qFdqhe4!r0^O^e5_A;3pJAeGT1WMa%8ysE;}&YpVUaNcoEm48|}*Fo$#dN_KN z@leT!-Scxi=j(>@0_C0=X8GY=*=pdx-N!VyNK5F#DI_vN^t)8`zMGxA!5Ed%+^h8kMp%Di2U*MCr?CRu{ z^QupCVi5fXOiYx)OD$w89h}@*PaBTaipS?*pa%f&$u;t8|M1CNs$@Bn)lreM;Luhi zAQVDG{qpYQ}lP zvR3`Zkh2euk)mmK=@o{Ynqu;*l7busHsbpa#3biQ=T5gIM7o|dEbarRsJ2F5i71B6 zTh@SLhI3Gtv&K6KWZQXy46ItHUTEsQ#x9Otr(fj>Ap8@~{Og1YzhBmcig_dA3oc@l?LitCU@upz*qx052+O17-O|`Dn z;=5mXs2ZO|JeQ`zsy4dSMMWOYAx$8b)*ejc^-*%Y@#eXB+N{em<+-L?=j@Y^7AdaU zusaq7)*TCKRZ(O}~y9FmiWh8cse)%nn z6V2CI`1ut{$XLN5I`2HSKq)yV1kD=eU4&=1mmU7sEc^-*@u01o_;&UKzbzY}sn-bp z`PLKhxG8DKma6ZdO^ja=C9vp6n36`8|6^bLoI>E5KiNkn)ZNg_;-$%uuJAqMHF!PMIs111h&U%{R7}n+2PDL-rWs zHWSrDEK>_bV(#!1DSI2DnoJiy7mgT~NmZ@V57LFu`gNK$)H}|wP;=)^NQ^+>s>GiL z*g4pBzkBw#Gsf)X>kUqC!&}g7{X3sSBvaD%(Z75|bD9Q#i-G3zzx@86S^e)AfPqfQ z(5m?FJ}Na-l{@%=pYg9E_`I?_$%!Z3d6_n}*NJd@YB^AMz=DAXGqju^jdaJ9c`=>C zeYX3%J&%QM|G*?YY~w1ABEj+{b=VfecJANr$HX<;tOA8T#3m)#SX+O+iGMrOjkHlR z3BjF923t>}zkk?yrJs6t4>Kir)$ z0`ZU`h$G2uu`uJ4_V%2+%y}Pq=Vot+ zzFPa_V+Ttyzz{y4V%hQzt2W7yA_{ zBNq=`fv?R+EPg*#T>&sjhZ2%+nr$d|g>JM)CNzS$<7$z##Jj_9^;RSzt#mCQTzme= z81>X#+>(~(bKi6ny36*6XmSIGprVPpv)ImC|I$e1e0Bfw)kyzr5J9L zu`!=V_Lb+Y+hHs?@m?VV2i$@kZ&PBDzGIJf5%g@N?0cLZZ{*$*O?tE5atpql9+9Jj z<1J0Tt^8s@Ugv3u>ddXI_;n?GjM z!AeC>rzPCYhD*W|n_-2$w8QY*>i%5fl#v;{P(HHm-N%U%Qoa|?Dg`@o7K9&bT+9xS zavo(n*5tdD?RPI*`M+GMIJ~E}4nZs)DWn<)D|`vaB$T9gOB6529@*Wase9yW?3YOI z%CBHq#y0iiMcc6>!r~*LfcA{D^|~%k=Y!&&hx^V9^$T!hz1kOEDoLlF0k38cv%sF6 zr^X{!>XNi42C;@L3mSL;7X?n-K;-GNI%A&?%DfE>Q&Z z>1=xI77n7Ke}K@?xh()t#w+F-(%3GOs7ZFB7-G*!CQ2B>K->_8{(|siGI)K7xpXV- zUqMZ-q%BigQd$?6rcV6k7yqu6_6XEo0Q!dGw7+e2e`>|u z#?sM}_viViOg8kCooPjY&6F!aj7PaErive85FxC}PqU!>4yROU)w zAFBJx;j>l}nA_avr8UcQYv#Kom9u4$N)JMaLY&%_MqL{pDer z2rGNNf>)8swfMxmtGqARprY6qH)CSlA@bAtN%LGo;h-z0m`Xn87k!K!hCR5XPx+0G zcUn97h_GpZmS43h-@w( z6Zpa@`9WDPX$u9w*a9jBfL<|;Y#(3{^tLq_Ccttibe{td!UW4&Nf;9U%76%e#$ ztJ|!wJYv5e`ZQQ9`@JxwG3L}r6CTADYzQ%6s1ns65kutPBnI^`MCDkht6EF1PBfE} zKKx=7uKAofiRAf#pyZv7qUiu7`cGS5)m?fX;!T&8$G#e83ApES!?xNs>&|T2E10!Q zoq1M{UwMB9Y@Nq-mI(~z+sB<@`@%m@DE@kOe*yIl(An`*;pwjr#=S#qzRUh^)m0URp4TH#bQ)WW{R=w|8h!vu~@FsM(yA z*7UQFZ1&Yih;<9k0ry!4j;hT9NVZIL3+s1Srdbon`yP&1K5wr4+7{F-lb6y*X_7`+ zLRE24xGikt?OJ-rUzQs6SX16faX3O58WS#|3xS7bfR-`!Dd6KxmtHQ3tyV-Nh5@qL zO$_q{z9&)C%FG$baV)(JVWLCaSs3YbMUGv%c#cgOGV8vQ)!MRVb?^vJpAu*UvM2o& zM8Iu3W-#iGi;fl7>+P;evyyv*4sYzpnKL;O;_TdBw&nFzwEdXM{JO}v65HBKNVQV8 z!FT>dKK$F3r+i+c-1}yA${s2r#VzKloZ^-FX zl=G}3o#<~mzH6LGIE2r;N++2b$IWwxO?l9i%VzBLA>cXG9-EmeYtB9?WW2X{y|5(e$6 zGq|&t1`9rO2YX`BgNHg+p*#wCw=!J}h>1|<$YWgI1PaPP2OjYOC~` zBye`}UFE$h93X9o)1|f}3UkTg<{LWwhS#9ZAHkW@3E8;SAS+mhS&026aE>2~e9AWK zI+SkZT6me8FnNqmyF!8nEIlW#QO#&U*_-sDSu;|5?|omAg(#Tx5%yC{t4>e?X8YO4 zo5BZ^pnNKsX|3cY0;;Lq6StscYDlDe@;+?S-$5X+UgGO?h-3A}mEaLmoy;IF9k%NB zq_uB_kEl1iEf85Wrc=op`}$Cjp^n;+6MTt-yw52kuR^tsSvPQ;|AEQ-ttpBN>rp_b zFi3$?!2pEw6*?(LtCE@GsJpjy)Q)0r*hPv>i<%&Y>Q~(dY&U$|mTKHwcl)Q8Ph)Zn zCaOKEhY!xVepce4cNF2>>vl>$HUbidJs!J`R>{Kex8p{ZUi$z-FAi+lAZumm4+$f(in; zIpkDy<`aYq-AG|@K%ITS*Visj>n7=j}0KU**S zhyrc~<3D0sQChcK^zjIkgT8@T$YEutXgsYneeG#%#1R_(jq2&r^XB`<(9#=+ARX~* z5F6d=^-@>29Zm8-Ze@f^obLKUhGr!uQ*hfi7Cg!U zHv%Q}QXy<6;hzptLCA-&+0($>P{cl5Y{I|;1`~uO(%KD03^k<4jH}(HEg&V6$qx@5 ztxITm@r6!0oJil+u*JoR4^uI?34&xz?j20H_XMegwNyL~g~vFN3)!(QrEK~=wZRfb z;Yg^3_%R=-&;>wjr>Lr&rxt0Ww?A#njtAQ9z^JqGH?eIAEDK(cP-owZI9zPLp?ia~ zfru#X>akzhjwEi;ix6vGmH8?yZ1*p%?+a=vOZb^fZbGBB-h6r7hxgs!m5`P;4JFxj zaqAa#h%OxkfQ>E2DE-Y@UDLA)9{RzZm~}R}(ER&^jc}4+y424tftXjmX++1?8UVFXK^^oAM?MLAD$uu zC72_s;wo+@p9)no5xcnnJjDvuIb)_-eU62+)I>7bjN%S) zk@+`9>FK;_lj)Y}2-M^Hw{t$jm(u}j-=#%R4Re`}4>C&I^K~bd=sB%jc0Tw8oi825 zZ!z#)L8)Y%?u38Xl=lTU6w#1LOC3M3(hd?p_?W=j0?IjO5p2Y5S35S*jln z9$ar6_dtSLx$EyUth{{kp@Le)-vi(m?9Jax0+~FRL$v`+Vdf^`Zr4xAPMkc~aJC%c zD7VZ4X&Uv&!m>^s0zj67EJrK3f-}dV`PY8wvqdjUv-vo~4GF0;mB5PcE^p#GBf>6dyKPZ(zD>NFNIg`PSWdJ?-K*%&)A8ytDO^v}Ztne}$38iT?p-BBlmJqZDF`#OkWS zOrScr^gJ&w+Eo-{Mk?YROAEQ}F$gvpQiR7{e(iD5S!QbCR?eDbalB2FWEG(&7L$K0 zPY3SmLuZD>>51a+Bv%G;RTS2joaeP0h6UWu2+o|-^NAS4z)*q@hAt9h8Un;gTCx`w z@5>s!b;;jLG0s1bllOt*f3>MJTGwm0sW~*{40)~38~LNzk=R5qZ{@|iCE6D56>GP5bwA@#D;ZQxG$1O;7oX*2FNhkZHQdeX*}?v&BpomSUwDk3Zd3H8mKa?F zv2Zq5cXf7g<2847wfwys^}o^wnujq-)xULP!o66&gZFZwiUwMhke2|DLJ|@=M&Z8Z z55(2xli{_r1Fgw?e2i@m!bI8?zX7#wcI{stuYO8SUX@??PHK}(?iyOiC<{{1*-?3O zXT^$tzZ)T#s-(6r(~jlXmT&9ppTZ}nTQH|YiC5d->K(^IXlfTaHpUrRZE6o3!MY1J za?`VsuHrhxGeE)z%{Wz zu({UBAQIKT{GNzO8S710ml$^kD7j(8d8S9RvyI7^Pi()7S6({yG zo(EX=lq*16@7d$;Q}rtfxyz@|JLn6zs>SgiKH$H1_uiSw@K9DR@3iAlNJ!lom5ezd zutj4=mo#9H+*0I8C3D`|#xU|p(SlZKx?Aj$r2e;c!yem;v70o8d$>l@$?84Jp&#W} z#_Mm;ChO4i+=DEbr*04#&o&~OrQ1Q3KzdpHII4GPu{}T$@S_o;AkY;k5EMus$kh{C z4Gq)WgnZaO^ar}QaBdLF1D&#&oVbP3n{~J%5Ab&VmBjgCnwXIv<%^##r8s*Fz|Ok; zR#5<_fmri{&X3!x!VVYYK*_58+VU|WS?*JuxOGX25g}&*;m=z4Lq)5_)lVqaf1gPy#Bf8|L-;Y_xKNQ0H`bf)xcl7lm7<(Hm0G4 z@~2+qufShBHh(~y(E9tOfAcH&ujTPSplDx+u73Xi#qwY6{8}^p!%_?WfBwYZDyY9& z`L&w#hn4l)|5#i4)xfU_{vQT1$bTC6Bi;WM`fE<|2NX>GFX*os%C8pw8fO2%qhAD} r0Ra9Mbbp2a)i3@T4x;}P{C6*@u8fUdT>tcb!<~ literal 0 HcmV?d00001 diff --git a/Parser_NEXT/config.yaml b/Parser_NEXT/config.yaml new file mode 100644 index 0000000..507640c --- /dev/null +++ b/Parser_NEXT/config.yaml @@ -0,0 +1,53 @@ +base_url: "https://www.next.pl/en" +locale: "en-GB" +timezoneId: "Europe/Warsaw" + +# На время отладки удобно видеть браузер: +headless: false + +nav_timeout_ms: 60000 +wait_timeout_ms: 30000 +retries: 3 + +# Рейт-лимит можно настраивать при масштабировании +rate_limit_per_host_per_sec: 1.0 + +scroll: + # Старые параметры (используются в резервном auto_scroll и для пауз) + max_scrolls: 80 + pause_ms_between_scrolls_min: 300 + pause_ms_between_scrolls_max: 700 + stop_if_no_new_items_after: 8 + + # Новые параметры для auto_scroll_until_total + hard_max_scrolls: 2500 # предохранитель на максимум скроллов + wait_networkidle_timeout_ms: 8000 # ожидание networkidle после каждого скролла + +selectors: + # карточки товаров + product_tile: '[data-testid="plp-product-grid-item"], [data-testid="product-tile"], .ProductCard, [data-qa="plp-product"]' + product_link: 'a[href*="/style/"], a[href*="/p/"], a[data-testid="productLink"]' + product_name: '[data-testid="product-name"], .productName, [itemprop="name"]' + product_price: '[data-testid="price"], [itemprop="price"], .price' + + # признак готовности + grid_ready: 'script[id^="next-product-summary-script-"], [data-testid="plp-product-grid-item"], [data-testid="product-grid"], .plpGrid, [data-qa="plp-grid"]' + + # счётчик общего количества в шапке (например "(434)") + total_count: '#plp-seo-heading .esi-count, .esi-count' + +xhr_patterns: + - "/search" + - "/api/search" + - "/plp" + - "/productsummary" + +output: + folder: "records_folder" + excel_prefix: "next_dump" + csv_also: true + jsonl_also: true + +debug: + dump_always: false # true — чтобы писать дампы на каждом шаге + diff --git a/Parser_NEXT/fetcher.py b/Parser_NEXT/fetcher.py new file mode 100644 index 0000000..4753a2f --- /dev/null +++ b/Parser_NEXT/fetcher.py @@ -0,0 +1,636 @@ +import asyncio +import logging +import re +import json +import os +from datetime import datetime +from pathlib import Path +from typing import List, Dict, Any, Optional +import re +from playwright.async_api import async_playwright +from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type + + +# ---- Price parsing helpers ---- +_PLN_PRICE_RE = re.compile( + r'(? float | None: + """ + '1 299,00 zł' / '1299 zł' / '1 299 zł' -> 1299.00 + Возвращает None, если распарсить не удалось. + """ + if not price_text: + return None + t = ( + price_text + .replace("\u00a0", " ") # NBSP + .replace("\u2009", " ") # thin space + .strip() + ) + m = _PLN_PRICE_RE.search(t) + if not m: + return None + num = m.group(1) + num = num.replace(" ", "").replace("\u00a0", "").replace("\u2009", "") + num = num.replace(",", ".") + try: + return float(num) + except Exception: + return None + + +class FetchError(Exception): + pass + + +class Fetcher: + """ + Browser layer: Playwright Chromium with anti-bot hygiene and robust debug dumps. + - Blocks heavy resources (fonts/media/images), keeps stylesheets. + - Waits for either SSR summary scripts or window.ssrClientSettings. + - Two ways to read product summaries: + 1) window.ssrClientSettings.productSummary + 2) inline