diff --git a/src/scrapers/player-puppeteer.js b/src/scrapers/player-puppeteer.js index 2fcb39b..0a01b5b 100644 --- a/src/scrapers/player-puppeteer.js +++ b/src/scrapers/player-puppeteer.js @@ -156,81 +156,144 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) { logger.info(`Looking for tournaments after ${afterDate.toDateString()}...`); - const newTournamentUrls = await page.evaluate((afterTimestamp) => { + const { urls: newTournamentUrls, counts } = await page.evaluate((afterTimestamp) => { const afterDate = new Date(afterTimestamp); const tables = document.querySelectorAll('table[id*="player-results"]'); const urls = []; - - tables.forEach(table => { - const rows = table.querySelectorAll('tbody tr'); + const seenUrls = new Set(); + let table = 0; + let recentEvents = 0; + let recentEventsAnchorsSeen = 0; + let recentEventsSkippedDuplicates = 0; + + tables.forEach(tbl => { + const rows = tbl.querySelectorAll('tbody tr'); rows.forEach(row => { const dateCell = row.querySelector('.dates'); const tournamentCell = row.querySelector('.tournament a'); - + if (dateCell && tournamentCell) { const dateText = dateCell.innerText.trim(); const dateMatch = dateText.match(/\d{1,2}-[A-Za-z]{3}-\d{4}/); - + if (dateMatch) { const dateStr = dateMatch[0]; const date = new Date(dateStr); - + if (date > afterDate) { const href = tournamentCell.getAttribute('href'); if (href) { - urls.push({ - url: `https://www.pdga.com${href}`, - date: dateStr, - name: tournamentCell.innerText.trim() - }); + const absoluteUrl = new URL(href, location.origin).href; + if (!seenUrls.has(absoluteUrl)) { + seenUrls.add(absoluteUrl); + urls.push({ + url: absoluteUrl, + date: dateStr, + name: tournamentCell.innerText.trim(), + source: 'table' + }); + table++; + } } } } } }); }); - - return urls; + + const recentAnchors = document.querySelectorAll('.recent-events a[href*="/tour/event/"]'); + recentAnchors.forEach(anchor => { + recentEventsAnchorsSeen++; + const href = anchor.getAttribute('href'); + if (href) { + const absoluteUrl = new URL(href, location.origin).href; + if (seenUrls.has(absoluteUrl)) { + recentEventsSkippedDuplicates++; + } else { + seenUrls.add(absoluteUrl); + urls.push({ + url: absoluteUrl, + date: null, + name: anchor.innerText.trim() || 'Recent event', + source: 'recent-events' + }); + recentEvents++; + } + } + }); + + return { urls, counts: { table, recentEvents, recentEventsAnchorsSeen, recentEventsSkippedDuplicates } }; }, afterDate.getTime()); - - logger.info(`Found ${newTournamentUrls.length} new tournaments after ${afterDate.toDateString()}`); + + logger.info({ + pdgaNumber, + afterDate: afterDate.toISOString(), + tableMatches: counts.table, + recentEventsMatches: counts.recentEvents, + recentEventsAnchorsSeen: counts.recentEventsAnchorsSeen, + recentEventsSkippedDuplicates: counts.recentEventsSkippedDuplicates, + totalUrlsToScrape: newTournamentUrls.length + }, 'new tournament URL discovery completed'); for (const tournamentData of newTournamentUrls) { try { - logger.info(`Scraping new tournament: ${tournamentData.name} (${tournamentData.date})`); - + if (tournamentData.source === 'recent-events') { + logger.debug({ pdgaNumber, url: tournamentData.url }, 'recent-events: scraping tournament'); + } else { + logger.info(`Scraping new tournament: ${tournamentData.name} (${tournamentData.date})`); + } + await page.goto(tournamentData.url, { waitUntil: 'domcontentloaded', timeout: 30000 }); await new Promise(r => setTimeout(r, 500)); - + + let parsedDate; + if (tournamentData.date !== null) { + parsedDate = parseDate(tournamentData.date); + } else { + const eventDateStr = await page.evaluate(() => { + const body = document.body ? document.body.innerText : ''; + const m = body.match(/\d{1,2}\s+to\s+\d{1,2}-[A-Za-z]{3}-\d{4}/) + || body.match(/\d{1,2}-[A-Za-z]{3}-\d{4}/); + return m ? m[0] : null; + }); + + if (eventDateStr) { + parsedDate = parseDate(eventDateStr); + logger.debug({ pdgaNumber, url: tournamentData.url, eventDateStr }, 'recent-events: extracted date from event page'); + } else { + logger.warn({ pdgaNumber, url: tournamentData.url }, 'recent-events: could not extract date from event page, skipping tournament'); + continue; + } + } + const roundRatings = await page.evaluate((pdgaNum) => { const rows = document.querySelectorAll('tr'); - + for (const row of rows) { const cells = row.querySelectorAll('td'); - const hasPlayerNumber = Array.from(cells).some(cell => + const hasPlayerNumber = Array.from(cells).some(cell => cell.innerText && cell.innerText.includes(pdgaNum.toString()) ); - + if (hasPlayerNumber) { const roundRatingCells = row.querySelectorAll('td.round-rating'); const ratings = []; - + roundRatingCells.forEach(cell => { const rating = parseInt(cell.innerText.trim()); if (!isNaN(rating) && rating > 0) { ratings.push(rating); } }); - + return ratings; } } - + return []; }, pdgaNumber); - + if (roundRatings.length > 0) { - const parsedDate = parseDate(tournamentData.date); roundRatings.forEach(rating => { newRounds.push({ rating, @@ -238,10 +301,10 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) { competition: tournamentData.name }); }); - + logger.info(`Found ${roundRatings.length} round ratings for ${tournamentData.name}`); } - + } catch (error) { logger.error(`Error scraping tournament ${tournamentData.name}: ${error.message}`); }