WorkSyncExtractor/scraper.py at master · FroDev-CR/WorkSyncExtractor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
"""
Módulo de extracción de órdenes de SupplyPro usando Playwright
"""
import pandas as pd
import asyncio
import sys
from playwright.async_api import async_playwright
from config import SUPPLYPRO_URL

def log(msg):
    """Log que SÍ se ve en Streamlit Cloud"""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()


async def extraer_ordenes(username: str, password: str) -> pd.DataFrame:
    """
    Extrae las órdenes de SupplyPro usando credenciales específicas

    Args:
        username: Usuario de SupplyPro
        password: Contraseña de SupplyPro

    Returns:
        DataFrame con las órdenes extraídas
    """
    async with async_playwright() as p:
        # Lanzar navegador en modo headless con opciones para cloud
        browser = await p.chromium.launch(
            headless=True,
            args=[
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--disable-web-security',
                '--disable-features=IsolateOrigins,site-per-process'
            ]
        )
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )
        page = await context.new_page()

        try:
            # Paso 1: Ir a la página de login
            log(f"🔗 Navegando a SupplyPro...")
            await page.goto(SUPPLYPRO_URL, wait_until='networkidle', timeout=60000)
            log(f"✅ Página cargada: {page.url}")

            # Paso 2: Esperar y llenar formulario
            log(f"📝 Llenando formulario de login...")
            await page.wait_for_selector('#user_name', state='visible', timeout=15000)
            await page.type('#user_name', username, delay=100)

            await page.wait_for_selector('#password', state='visible', timeout=15000)
            await page.type('#password', password, delay=100)

            log(f"✅ Credenciales ingresadas")

            # Paso 3: Submit y esperar navegación
            log(f"🚀 Enviando login...")
            submit_button = await page.query_selector('input[type="submit"]')

            # Click y esperar navegación con más tiempo
            await submit_button.click()
            log(f"⏳ Esperando navegación después del login...")
            await page.wait_for_load_state('networkidle', timeout=60000)
            await page.wait_for_timeout(5000)  # Aumentado de 3s a 5s

            current_url = page.url
            log(f"📍 URL después del login: {current_url}")

            # Paso 4: Verificar que el login fue exitoso
            if 'Login.asp' in current_url:
                # Verificar si es un force_signon (sesión activa en otro lugar)
                if 'force_signon=Y' in current_url or 'force%5Fsignon=Y' in current_url:
                    log(f"⚠️ Sesión activa detectada, forzando nuevo login...")

                    # Buscar y hacer click en el botón para forzar el login
                    try:
                        # Intentar encontrar el botón de submit nuevamente
                        force_button = await page.query_selector('input[type="submit"]')
                        if force_button:
                            await force_button.click()
                            log(f"✅ Click en forzar login enviado")
                            await page.wait_for_load_state('networkidle', timeout=60000)
                            await page.wait_for_timeout(3000)

                            current_url = page.url
                            log(f"📍 URL después de forzar login: {current_url}")

                            # Verificar si ahora sí funcionó
                            if 'Login.asp' in current_url:
                                raise Exception("❌ No se pudo forzar el login. Cierra otras sesiones de SupplyPro con este usuario.")
                        else:
                            raise Exception("❌ No se encontró el botón para forzar login.")
                    except Exception as e:
                        raise Exception(f"❌ Error al forzar login: {str(e)}")

                else:
                    # No es force_signon, es un error real
                    page_text = await page.content()

                    # Buscar mensajes de error más específicos
                    if 'invalid' in page_text.lower() or 'incorrect' in page_text.lower():
                        raise Exception("❌ Credenciales incorrectas. Verifica usuario y contraseña.")
                    elif 'locked' in page_text.lower() or 'disabled' in page_text.lower():
                        raise Exception("❌ La cuenta puede estar bloqueada o deshabilitada.")
                    else:
                        # Log más información para debug
                        log(f"⚠️ DEBUG: Username usado: {username}")
                        log(f"⚠️ DEBUG: URL actual: {current_url}")
                        # Buscar si hay algún mensaje de error en la página
                        error_elements = await page.query_selector_all('.error, .alert, .warning, [class*="error"], [class*="alert"]')
                        if error_elements:
                            for elem in error_elements[:3]:
                                text = await elem.text_content()
                                if text and text.strip():
                                    log(f"⚠️ Mensaje en página: {text.strip()}")

                        raise Exception("❌ El login no se completó. Verifica que las credenciales sean correctas en config.py")

            log(f"✅ Login exitoso")

            # Paso 5: Buscar y hacer click en "Newly Received Orders"
            print(f"🔍 Buscando link 'Newly Received Orders'...")

            # Esperar un poco más para que la página cargue completamente
            await page.wait_for_timeout(5000)

            # Intentar múltiples selectores
            orden_clickeado = False

            # Intento 1: Link con texto exacto
            try:
                link = page.locator('a:has-text("Newly Received Orders")').first
                if await link.count() > 0:
                    print(f"✅ Link encontrado (método 1)")
                    await link.click(timeout=10000)
                    orden_clickeado = True
            except Exception as e:
                print(f"⚠️ Método 1 falló: {str(e)}")

            # Intento 2: Buscar en toda la página
            if not orden_clickeado:
                try:
                    all_links = await page.query_selector_all('a')
                    for link in all_links:
                        text = await link.text_content()
                        if text and 'Newly Received Orders' in text:
                            print(f"✅ Link encontrado (método 2)")
                            await link.click()
                            orden_clickeado = True
                            break
                except Exception as e:
                    print(f"⚠️ Método 2 falló: {str(e)}")

            # Intento 3: Si no encontró "Newly Received Orders", hacer click en "Orders" primero
            if not orden_clickeado:
                try:
                    print(f"🔍 Intentando click en 'Orders' primero...")
                    orders_link = page.locator('a:has-text("Orders")').first
                    if await orders_link.count() > 0:
                        await orders_link.click(timeout=10000)
                        await page.wait_for_load_state('networkidle', timeout=30000)
                        await page.wait_for_timeout(3000)
                        print(f"✅ Click en 'Orders' exitoso")

                        # Ahora buscar "Received" o "Newly Received Orders"
                        received_link = page.locator('a:has-text("Received")').first
                        if await received_link.count() > 0:
                            print(f"✅ Link 'Received' encontrado")
                            await received_link.click(timeout=10000)
                            orden_clickeado = True
                except Exception as e:
                    print(f"⚠️ Método 3 falló: {str(e)}")

            if not orden_clickeado:
                # Obtener todos los links disponibles para debug
                all_links = await page.query_selector_all('a')
                links_text = []
                for link in all_links[:20]:  # Primeros 20 links
                    text = await link.text_content()
                    if text:
                        links_text.append(text.strip())

                raise Exception(f"❌ No se encontró el link 'Newly Received Orders'. Links disponibles: {', '.join(links_text[:10])}")

            # Paso 6: Esperar a que cargue la página de órdenes
            print(f"⏳ Esperando página de órdenes...")
            await page.wait_for_load_state('networkidle', timeout=60000)
            await page.wait_for_timeout(3000)

            # Paso 7: Esperar el filtro
            print(f"🔍 Buscando filtro de órdenes...")
            await page.wait_for_selector('select[name="ref_epo_filter"]', state='visible', timeout=30000)
            print(f"✅ Filtro encontrado")

            # Paso 8: Seleccionar filtro y ampliar filas por página
            print(f"⚙️ Aplicando filtro...")
            await page.select_option('select[name="ref_epo_filter"]', label='Show All Except EPOs')
            await page.wait_for_timeout(5000)
            await page.wait_for_load_state('networkidle', timeout=30000)
            print(f"✅ Filtro aplicado")

            # Aumentar filas por página para capturar todas en una sola página
            try:
                rpp = await page.query_selector('select[name="rowsPerPage"], select[name="rows_per_page"], select[name="RowsPerPage"]')
                if rpp:
                    # Intentar con 200, si no existe probar con el valor más alto disponible
                    options = await rpp.query_selector_all('option')
                    values = [await o.get_attribute('value') for o in options]
                    target = '200' if '200' in values else (values[-1] if values else None)
                    if target:
                        await rpp.select_option(value=target)
                        await page.wait_for_load_state('networkidle', timeout=30000)
                        await page.wait_for_timeout(3000)
                        print(f"✅ Filas por página ajustado a {target}")
            except Exception as rpp_err:
                print(f"⚠️ No se pudo ajustar filas por página: {rpp_err}")

            # Paso 9: Extraer tabla
            print(f"📊 Extrayendo tabla de órdenes...")
            table_element = await page.query_selector('//th[contains(normalize-space(.), "Builder")]/ancestor::table')

            if not table_element:
                raise Exception("❌ No se encontró la tabla de órdenes. Puede que no haya órdenes disponibles.")

            # Obtener HTML de la tabla
            table_html = await table_element.inner_html()
            full_table_html = f"<table>{table_html}</table>"

            # Convertir a DataFrame SIN interpretar headers (igual que el código original)
            from io import StringIO
            import tempfile
            import os

            df = pd.read_html(StringIO(full_table_html), header=None)[0]
            print(f"✅ Extraídas {len(df)} filas")

            # Guardar a CSV temporal (como hace el código original)
            temp_csv = tempfile.mktemp(suffix='.csv')
            df.to_csv(temp_csv, index=False, encoding='utf-8-sig', header=False)
            print(f"✅ Guardado a CSV temporal")

            # Leer de vuelta SIN headers (como el código original)
            df_final = pd.read_csv(temp_csv, header=None, dtype=str, encoding='utf-8-sig')
            print(f"✅ Releído desde CSV: {len(df_final)} filas")

            # Limpiar archivo temporal
            try:
                os.remove(temp_csv)
            except:
                pass

            # Paso 10: Seleccionar todas las órdenes y aceptarlas
            print("🔍 Diagnosticando elementos de aceptación...")
            try:
                # ── DEBUG: listar todos los inputs y selects visibles ──────────
                all_inputs = await page.query_selector_all('input[type="checkbox"]')
                print(f"  checkboxes encontrados: {len(all_inputs)}")
                for inp in all_inputs[:5]:
                    name = await inp.get_attribute('name') or ''
                    id_  = await inp.get_attribute('id') or ''
                    print(f"    checkbox name={name!r} id={id_!r}")

                all_selects = await page.query_selector_all('select')
                print(f"  selects encontrados: {len(all_selects)}")
                for s in all_selects:
                    name = await s.get_attribute('name') or ''
                    opts = await s.query_selector_all('option')
                    opt_texts = []
                    for o in opts:
                        t = await o.text_content()
                        opt_texts.append((t or '').strip())
                    print(f"    select name={name!r} opciones={opt_texts}")

                all_btns = await page.query_selector_all('input[type="submit"], input[type="button"], button')
                print(f"  botones encontrados: {len(all_btns)}")
                for b in all_btns:
                    val  = await b.get_attribute('value') or ''
                    txt  = await b.text_content() or ''
                    name = await b.get_attribute('name') or ''
                    print(f"    btn value={val!r} text={txt.strip()!r} name={name!r}")

                # Screenshot para ver el estado de la página
                try:
                    await page.screenshot(path='/tmp/debug_accept.png', full_page=False)
                    print("  Screenshot guardado en /tmp/debug_accept.png")
                except Exception:
                    pass

                # ── Seleccionar todas: header checkbox ────────────────────────
                header_cb = None
                # El primer checkbox suele ser el del header
                if all_inputs:
                    header_cb = all_inputs[0]
                    name = await header_cb.get_attribute('name') or ''
                    print(f"✅ Usando primer checkbox (name={name!r}) como header")
                    await header_cb.click()
                    await page.wait_for_timeout(800)

                # ── Seleccionar acción "Accept Selected Orders" ───────────────
                action_selected = False
                for s in all_selects:
                    opts = await s.query_selector_all('option')
                    for opt in opts:
                        txt = (await opt.text_content() or '').strip()
                        if 'Accept Selected Orders' in txt:
                            val = await opt.get_attribute('value') or txt
                            await s.select_option(value=val)
                            action_selected = True
                            sname = await s.get_attribute('name') or ''
                            print(f"✅ Acción seleccionada en select name={sname!r} value={val!r}")
                            break
                    if action_selected:
                        break

                if not action_selected:
                    print("⚠️ No se encontró 'Accept Selected Orders' en ningún select")
                else:
                    await page.wait_for_timeout(500)
                    # ── Click en Update ───────────────────────────────────────
                    updated = False
                    for b in all_btns:
                        val = (await b.get_attribute('value') or '').strip()
                        txt = (await b.text_content() or '').strip()
                        if val == 'Update' or txt == 'Update':
                            await b.click()
                            updated = True
                            print(f"✅ Click en Update (value={val!r})")
                            break

                    if updated:
                        await page.wait_for_load_state('networkidle', timeout=30000)
                        await page.wait_for_timeout(2000)
                        print("✅ Órdenes aceptadas exitosamente")
                    else:
                        print("⚠️ No se encontró el botón Update")

            except Exception as accept_err:
                print(f"⚠️ Error al aceptar órdenes (extracción OK): {accept_err}")

            # Cerrar sesión
            try:
                await page.click('text=Sign Out', timeout=5000)
                await page.wait_for_timeout(1000)
            except:
                pass

            return df_final

        except Exception as e:
            # Capturar screenshot para debugging
            try:
                screenshot_bytes = await page.screenshot(full_page=True)
                # Guardar para debugging
                with open('/tmp/error_screenshot.png', 'wb') as f:
                    f.write(screenshot_bytes)
                error_msg = f"{str(e)}\n\n[DEBUG] Screenshot guardado en /tmp/error_screenshot.png"
            except:
                error_msg = str(e)

            raise Exception(error_msg)

        finally:
            await browser.close()


def ejecutar_extraccion(username: str, password: str) -> pd.DataFrame:
    """
    Wrapper síncrono para la función asíncrona de extracción
    """
    return asyncio.run(extraer_ordenes(username, password))