fix: parse latest tournament from recent-events list on player page (#24)
This commit is contained in:
@@ -156,13 +156,18 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) {
|
|||||||
|
|
||||||
logger.info(`Looking for tournaments after ${afterDate.toDateString()}...`);
|
logger.info(`Looking for tournaments after ${afterDate.toDateString()}...`);
|
||||||
|
|
||||||
const newTournamentUrls = await page.evaluate((afterTimestamp) => {
|
const { urls: newTournamentUrls, counts } = await page.evaluate((afterTimestamp) => {
|
||||||
const afterDate = new Date(afterTimestamp);
|
const afterDate = new Date(afterTimestamp);
|
||||||
const tables = document.querySelectorAll('table[id*="player-results"]');
|
const tables = document.querySelectorAll('table[id*="player-results"]');
|
||||||
const urls = [];
|
const urls = [];
|
||||||
|
const seenUrls = new Set();
|
||||||
|
let table = 0;
|
||||||
|
let recentEvents = 0;
|
||||||
|
let recentEventsAnchorsSeen = 0;
|
||||||
|
let recentEventsSkippedDuplicates = 0;
|
||||||
|
|
||||||
tables.forEach(table => {
|
tables.forEach(tbl => {
|
||||||
const rows = table.querySelectorAll('tbody tr');
|
const rows = tbl.querySelectorAll('tbody tr');
|
||||||
rows.forEach(row => {
|
rows.forEach(row => {
|
||||||
const dateCell = row.querySelector('.dates');
|
const dateCell = row.querySelector('.dates');
|
||||||
const tournamentCell = row.querySelector('.tournament a');
|
const tournamentCell = row.querySelector('.tournament a');
|
||||||
@@ -178,11 +183,17 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) {
|
|||||||
if (date > afterDate) {
|
if (date > afterDate) {
|
||||||
const href = tournamentCell.getAttribute('href');
|
const href = tournamentCell.getAttribute('href');
|
||||||
if (href) {
|
if (href) {
|
||||||
urls.push({
|
const absoluteUrl = new URL(href, location.origin).href;
|
||||||
url: `https://www.pdga.com${href}`,
|
if (!seenUrls.has(absoluteUrl)) {
|
||||||
date: dateStr,
|
seenUrls.add(absoluteUrl);
|
||||||
name: tournamentCell.innerText.trim()
|
urls.push({
|
||||||
});
|
url: absoluteUrl,
|
||||||
|
date: dateStr,
|
||||||
|
name: tournamentCell.innerText.trim(),
|
||||||
|
source: 'table'
|
||||||
|
});
|
||||||
|
table++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -190,18 +201,71 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
return urls;
|
const recentAnchors = document.querySelectorAll('.recent-events a[href*="/tour/event/"]');
|
||||||
|
recentAnchors.forEach(anchor => {
|
||||||
|
recentEventsAnchorsSeen++;
|
||||||
|
const href = anchor.getAttribute('href');
|
||||||
|
if (href) {
|
||||||
|
const absoluteUrl = new URL(href, location.origin).href;
|
||||||
|
if (seenUrls.has(absoluteUrl)) {
|
||||||
|
recentEventsSkippedDuplicates++;
|
||||||
|
} else {
|
||||||
|
seenUrls.add(absoluteUrl);
|
||||||
|
urls.push({
|
||||||
|
url: absoluteUrl,
|
||||||
|
date: null,
|
||||||
|
name: anchor.innerText.trim() || 'Recent event',
|
||||||
|
source: 'recent-events'
|
||||||
|
});
|
||||||
|
recentEvents++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return { urls, counts: { table, recentEvents, recentEventsAnchorsSeen, recentEventsSkippedDuplicates } };
|
||||||
}, afterDate.getTime());
|
}, afterDate.getTime());
|
||||||
|
|
||||||
logger.info(`Found ${newTournamentUrls.length} new tournaments after ${afterDate.toDateString()}`);
|
logger.info({
|
||||||
|
pdgaNumber,
|
||||||
|
afterDate: afterDate.toISOString(),
|
||||||
|
tableMatches: counts.table,
|
||||||
|
recentEventsMatches: counts.recentEvents,
|
||||||
|
recentEventsAnchorsSeen: counts.recentEventsAnchorsSeen,
|
||||||
|
recentEventsSkippedDuplicates: counts.recentEventsSkippedDuplicates,
|
||||||
|
totalUrlsToScrape: newTournamentUrls.length
|
||||||
|
}, 'new tournament URL discovery completed');
|
||||||
|
|
||||||
for (const tournamentData of newTournamentUrls) {
|
for (const tournamentData of newTournamentUrls) {
|
||||||
try {
|
try {
|
||||||
logger.info(`Scraping new tournament: ${tournamentData.name} (${tournamentData.date})`);
|
if (tournamentData.source === 'recent-events') {
|
||||||
|
logger.debug({ pdgaNumber, url: tournamentData.url }, 'recent-events: scraping tournament');
|
||||||
|
} else {
|
||||||
|
logger.info(`Scraping new tournament: ${tournamentData.name} (${tournamentData.date})`);
|
||||||
|
}
|
||||||
|
|
||||||
await page.goto(tournamentData.url, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
await page.goto(tournamentData.url, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
||||||
await new Promise(r => setTimeout(r, 500));
|
await new Promise(r => setTimeout(r, 500));
|
||||||
|
|
||||||
|
let parsedDate;
|
||||||
|
if (tournamentData.date !== null) {
|
||||||
|
parsedDate = parseDate(tournamentData.date);
|
||||||
|
} else {
|
||||||
|
const eventDateStr = await page.evaluate(() => {
|
||||||
|
const body = document.body ? document.body.innerText : '';
|
||||||
|
const m = body.match(/\d{1,2}\s+to\s+\d{1,2}-[A-Za-z]{3}-\d{4}/)
|
||||||
|
|| body.match(/\d{1,2}-[A-Za-z]{3}-\d{4}/);
|
||||||
|
return m ? m[0] : null;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (eventDateStr) {
|
||||||
|
parsedDate = parseDate(eventDateStr);
|
||||||
|
logger.debug({ pdgaNumber, url: tournamentData.url, eventDateStr }, 'recent-events: extracted date from event page');
|
||||||
|
} else {
|
||||||
|
logger.warn({ pdgaNumber, url: tournamentData.url }, 'recent-events: could not extract date from event page, skipping tournament');
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const roundRatings = await page.evaluate((pdgaNum) => {
|
const roundRatings = await page.evaluate((pdgaNum) => {
|
||||||
const rows = document.querySelectorAll('tr');
|
const rows = document.querySelectorAll('tr');
|
||||||
|
|
||||||
@@ -230,7 +294,6 @@ async function getNewTournamentRounds(browser, pdgaNumber, afterDate) {
|
|||||||
}, pdgaNumber);
|
}, pdgaNumber);
|
||||||
|
|
||||||
if (roundRatings.length > 0) {
|
if (roundRatings.length > 0) {
|
||||||
const parsedDate = parseDate(tournamentData.date);
|
|
||||||
roundRatings.forEach(rating => {
|
roundRatings.forEach(rating => {
|
||||||
newRounds.push({
|
newRounds.push({
|
||||||
rating,
|
rating,
|
||||||
|
|||||||
Reference in New Issue
Block a user