Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0d2f0fa3a8 | |||
| ec3ae872da |
@@ -156,81 +156,154 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) {
|
||||
|
||||
logger.info(`Looking for tournaments after ${afterDate.toDateString()}...`);
|
||||
|
||||
const newTournamentUrls = await page.evaluate((afterTimestamp) => {
|
||||
const { urls: newTournamentUrls, counts } = await page.evaluate((afterTimestamp) => {
|
||||
const afterDate = new Date(afterTimestamp);
|
||||
const tables = document.querySelectorAll('table[id*="player-results"]');
|
||||
const urls = [];
|
||||
|
||||
tables.forEach(table => {
|
||||
const rows = table.querySelectorAll('tbody tr');
|
||||
const seenUrls = new Set();
|
||||
let table = 0;
|
||||
let recentEvents = 0;
|
||||
let recentEventsAnchorsSeen = 0;
|
||||
let recentEventsSkippedDuplicates = 0;
|
||||
|
||||
tables.forEach(tbl => {
|
||||
const rows = tbl.querySelectorAll('tbody tr');
|
||||
rows.forEach(row => {
|
||||
const dateCell = row.querySelector('.dates');
|
||||
const tournamentCell = row.querySelector('.tournament a');
|
||||
|
||||
|
||||
if (dateCell && tournamentCell) {
|
||||
const dateText = dateCell.innerText.trim();
|
||||
const dateMatch = dateText.match(/\d{1,2}-[A-Za-z]{3}-\d{4}/);
|
||||
|
||||
|
||||
if (dateMatch) {
|
||||
const dateStr = dateMatch[0];
|
||||
const date = new Date(dateStr);
|
||||
|
||||
|
||||
if (date > afterDate) {
|
||||
const href = tournamentCell.getAttribute('href');
|
||||
if (href) {
|
||||
urls.push({
|
||||
url: `https://www.pdga.com${href}`,
|
||||
date: dateStr,
|
||||
name: tournamentCell.innerText.trim()
|
||||
});
|
||||
const absoluteUrl = new URL(href, location.origin).href;
|
||||
if (!seenUrls.has(absoluteUrl)) {
|
||||
seenUrls.add(absoluteUrl);
|
||||
urls.push({
|
||||
url: absoluteUrl,
|
||||
date: dateStr,
|
||||
name: tournamentCell.innerText.trim(),
|
||||
source: 'table'
|
||||
});
|
||||
table++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
return urls;
|
||||
|
||||
const recentAnchors = document.querySelectorAll('.recent-events a[href*="/tour/event/"]');
|
||||
recentAnchors.forEach(anchor => {
|
||||
recentEventsAnchorsSeen++;
|
||||
const href = anchor.getAttribute('href');
|
||||
if (href) {
|
||||
const absoluteUrl = new URL(href, location.origin).href;
|
||||
if (seenUrls.has(absoluteUrl)) {
|
||||
recentEventsSkippedDuplicates++;
|
||||
} else {
|
||||
seenUrls.add(absoluteUrl);
|
||||
urls.push({
|
||||
url: absoluteUrl,
|
||||
date: null,
|
||||
name: anchor.innerText.trim() || 'Recent event',
|
||||
source: 'recent-events'
|
||||
});
|
||||
recentEvents++;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return { urls, counts: { table, recentEvents, recentEventsAnchorsSeen, recentEventsSkippedDuplicates } };
|
||||
}, afterDate.getTime());
|
||||
|
||||
logger.info(`Found ${newTournamentUrls.length} new tournaments after ${afterDate.toDateString()}`);
|
||||
|
||||
logger.info({
|
||||
pdgaNumber,
|
||||
afterDate: afterDate.toISOString(),
|
||||
tableMatches: counts.table,
|
||||
recentEventsMatches: counts.recentEvents,
|
||||
recentEventsAnchorsSeen: counts.recentEventsAnchorsSeen,
|
||||
recentEventsSkippedDuplicates: counts.recentEventsSkippedDuplicates,
|
||||
totalUrlsToScrape: newTournamentUrls.length
|
||||
}, 'new tournament URL discovery completed');
|
||||
|
||||
for (const tournamentData of newTournamentUrls) {
|
||||
try {
|
||||
logger.info(`Scraping new tournament: ${tournamentData.name} (${tournamentData.date})`);
|
||||
|
||||
if (tournamentData.source === 'recent-events') {
|
||||
logger.debug({ pdgaNumber, url: tournamentData.url }, 'recent-events: scraping tournament');
|
||||
} else {
|
||||
logger.info(`Scraping new tournament: ${tournamentData.name} (${tournamentData.date})`);
|
||||
}
|
||||
|
||||
await page.goto(tournamentData.url, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
|
||||
|
||||
let parsedDate;
|
||||
if (tournamentData.date !== null) {
|
||||
parsedDate = parseDate(tournamentData.date);
|
||||
} else {
|
||||
const eventDateStr = await page.evaluate(() => {
|
||||
const body = document.body ? document.body.innerText : '';
|
||||
const m = body.match(/\d{1,2}\s+to\s+\d{1,2}-[A-Za-z]{3}-\d{4}/)
|
||||
|| body.match(/\d{1,2}-[A-Za-z]{3}-\d{4}/);
|
||||
return m ? m[0] : null;
|
||||
});
|
||||
|
||||
if (eventDateStr) {
|
||||
parsedDate = parseDate(eventDateStr);
|
||||
if (!(parsedDate > afterDate)) {
|
||||
logger.warn({
|
||||
pdgaNumber,
|
||||
url: tournamentData.url,
|
||||
eventDateStr,
|
||||
parsedDate: parsedDate ? parsedDate.toISOString() : null,
|
||||
afterDate: afterDate.toISOString()
|
||||
}, 'recent-events: extracted event date is not newer than afterDate, likely captured a non-tournament date — skipping');
|
||||
continue;
|
||||
}
|
||||
logger.debug({ pdgaNumber, url: tournamentData.url, eventDateStr }, 'recent-events: extracted date from event page');
|
||||
} else {
|
||||
logger.warn({ pdgaNumber, url: tournamentData.url }, 'recent-events: could not extract date from event page, skipping tournament');
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const roundRatings = await page.evaluate((pdgaNum) => {
|
||||
const rows = document.querySelectorAll('tr');
|
||||
|
||||
|
||||
for (const row of rows) {
|
||||
const cells = row.querySelectorAll('td');
|
||||
const hasPlayerNumber = Array.from(cells).some(cell =>
|
||||
const hasPlayerNumber = Array.from(cells).some(cell =>
|
||||
cell.innerText && cell.innerText.includes(pdgaNum.toString())
|
||||
);
|
||||
|
||||
|
||||
if (hasPlayerNumber) {
|
||||
const roundRatingCells = row.querySelectorAll('td.round-rating');
|
||||
const ratings = [];
|
||||
|
||||
|
||||
roundRatingCells.forEach(cell => {
|
||||
const rating = parseInt(cell.innerText.trim());
|
||||
if (!isNaN(rating) && rating > 0) {
|
||||
ratings.push(rating);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
return ratings;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return [];
|
||||
}, pdgaNumber);
|
||||
|
||||
|
||||
if (roundRatings.length > 0) {
|
||||
const parsedDate = parseDate(tournamentData.date);
|
||||
roundRatings.forEach(rating => {
|
||||
newRounds.push({
|
||||
rating,
|
||||
@@ -238,10 +311,10 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) {
|
||||
competition: tournamentData.name
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
logger.info(`Found ${roundRatings.length} round ratings for ${tournamentData.name}`);
|
||||
}
|
||||
|
||||
|
||||
} catch (error) {
|
||||
logger.error(`Error scraping tournament ${tournamentData.name}: ${error.message}`);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user