Files
pdga-rating/src/scrapers/course-puppeteer.js
T
Samuel Enocsson d567c4bca9 fix: upgrade Node 18 to 22 and fix Puppeteer compatibility
- Switch from Alpine to Debian slim for correct Chromium architecture
  (fixes ARM/Apple Silicon support)
- Upgrade Puppeteer 21 to 24, use system Chromium via PUPPETEER_EXECUTABLE_PATH
- Replace removed page.waitForTimeout() with setTimeout
- Set NODE_ENV=production in Dockerfile to prevent pino-pretty import
- Improve error logging with Pino's { err: error } pattern
- Add build: . to docker-compose for local development builds
2026-03-20 07:39:34 +01:00

351 lines
11 KiB
JavaScript

const { saveCourseToDB, saveLayoutToDB } = require('../models/course');
const logger = require('../logger');
// In-memory cache for layout-division-event mapping.
// scrapeCourseLayouts stores, per course id (string ids are converted with
// parseInt first), the array of layouts it extracted from the course page;
// each entry has the shape { name, par, divisions, eventUrl }.
// The cache lives only in this module instance and is never cleared here.
const layoutEventCache = new Map();
/**
 * Return the module-level layout cache.
 *
 * @returns {Map} The shared Map instance itself (not a copy), so mutations
 *   made by the caller are visible to every other user of the cache.
 */
function getLayoutEventCache() {
return layoutEventCache;
}
/**
 * Scrape the PDGA course directory for Swedish courses (18-100 holes) and
 * persist every course found.
 *
 * Result pages are requested one at a time (page=0, 1, 2, ...) until a page
 * contains no course rows. Each course is handed to saveCourseToDB; a failed
 * save is logged and does not stop the scrape. Any other error (navigation,
 * evaluation) ends the scrape early, is logged, and the courses collected so
 * far are still returned.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @returns {Promise<Array<{name: string, link: string, city: string}>>}
 *   Every course scraped, in page order (including ones that failed to save).
 */
async function scrapeCourseDirectory(browser) {
  logger.info('Scraping Swedish courses from PDGA course directory');
  const page = await browser.newPage();
  const allCourses = [];
  let pageNumber = 0;
  let hasMorePages = true;

  try {
    while (hasMorePages) {
      const url = `https://www.pdga.com/course-directory/advanced?title=&field_course_location_country=SE&field_course_location_locality=&field_course_location_administrative_area=All&field_course_location_postal_code=&field_course_type_value=All&rating_value=All&field_course_holes_value=18-100&field_course_total_length_value=All&field_course_target_type_value=All&field_course_tee_type_value=All&field_location_type_value=All&field_course_camping_value=All&field_course_facilities_value=All&field_course_fees_value=All&field_course_handicap_value=All&field_course_private_value=All&field_course_signage_value=All&field_cart_friendly_value=All&page=${pageNumber}`;
      logger.info(`Scraping page ${pageNumber}...`);
      await page.goto(url, { waitUntil: 'networkidle2', timeout: 45000 });
      // Short settle delay after navigation before reading the table.
      await new Promise((resolve) => setTimeout(resolve, 1000));

      // NOTE: this callback is executed by page.evaluate, so it must not
      // reference anything from the surrounding Node scope.
      const courses = await page.evaluate(() => {
        const siteOrigin = 'https://www.pdga.com';
        const courseData = [];
        document.querySelectorAll('table tbody tr').forEach((row) => {
          const titleCell = row.querySelector('td.views-field-title');
          const link = titleCell ? titleCell.querySelector('a') : null;
          const href = link ? link.getAttribute('href') : null;
          if (!href) {
            // No usable link: there is nothing to identify the course by.
            return;
          }
          const locationCell = row.querySelector('td.views-field-field-course-location');
          courseData.push({
            name: link.innerText.trim(),
            // Only prefix the origin for site-relative hrefs; an absolute
            // href is already a complete URL.
            link: /^https?:\/\//i.test(href) ? href : siteOrigin + href,
            city: locationCell ? locationCell.innerText.trim() : 'Unknown',
          });
        });
        return courseData;
      });

      if (courses.length === 0) {
        logger.info(`No courses found on page ${pageNumber}, stopping pagination`);
        hasMorePages = false;
        continue;
      }

      logger.info(`Found ${courses.length} courses on page ${pageNumber}`);
      allCourses.push(...courses);

      for (const course of courses) {
        try {
          await saveCourseToDB(course);
          logger.info(`Saved course: ${course.name} (${course.city})`);
        } catch (err) {
          // Pass the error object itself so the logger keeps the stack trace,
          // matching the { err } pattern used by the outer handler.
          logger.error({ err }, `Error saving course ${course.name}`);
        }
      }

      pageNumber++;
      // A non-empty page always leads to another request, so always pause.
      logger.info('Waiting 2s before next page...');
      await new Promise((resolve) => setTimeout(resolve, 2000));
    }

    logger.info(`Total courses scraped: ${allCourses.length} across ${pageNumber} pages`);
  } catch (error) {
    logger.error({ err: error }, 'Error scraping course directory');
  } finally {
    await page.close();
  }

  return allCourses;
}
/**
 * Scrape the layouts listed on a PDGA course page and persist them.
 *
 * Steps:
 *  1. Open the course page and try to click a tab whose text contains
 *     "Layout".
 *  2. Read every `div.layout` inside `div.tournaments` and extract its name,
 *     par, divisions and the results link of the event it belongs to.
 *     Layouts without a name or a positive par are skipped.
 *  3. Store the full (non-deduplicated) list in layoutEventCache under the
 *     course id, then save one layout per distinct `name|par` pair through
 *     saveLayoutToDB. A failed save is logged and does not stop the others.
 *
 * Errors during navigation or extraction are logged and the layouts
 * collected so far (possibly none) are returned.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} courseLink - URL of the course page.
 * @param {number|string} courseId - Course id; passed unchanged to
 *   saveLayoutToDB, and parsed as a base-10 integer for the cache key when it
 *   is a string.
 * @returns {Promise<Array<{name: string, par: number, divisions: string[], eventUrl: (string|null)}>>}
 *   All extracted layouts, duplicates included.
 */
async function scrapeCourseLayouts(browser, courseLink, courseId) {
  logger.info(`Scraping layouts from: ${courseLink}`);
  const page = await browser.newPage();
  const layouts = [];

  try {
    await page.goto(courseLink, { waitUntil: 'networkidle2', timeout: 45000 });
    await new Promise((resolve) => setTimeout(resolve, 1000));

    // Callbacks given to page.evaluate must be self-contained: they cannot
    // see variables from this Node scope.
    const layoutsTabClicked = await page.evaluate(() => {
      // Ordered from most specific to most generic selector.
      const selectors = [
        'a.quicktabs-tab-course_node-2',
        'li.quicktabs-tab-course_node-2 a',
        'a[href*="layouts"]',
        '.quicktabs-tabs a',
        'ul.quicktabs-tabs a',
        '.quicktabs-wrapper a',
      ];
      for (const selector of selectors) {
        for (const tab of document.querySelectorAll(selector)) {
          const text = tab.innerText?.trim();
          // 'Layout' is a substring of 'Layouts', so one check covers both.
          if (text && text.includes('Layout')) {
            tab.click();
            return true;
          }
        }
      }
      return false;
    });

    if (layoutsTabClicked) {
      logger.info('Layouts tab found and clicked');
      // Give the tab content time to render.
      await new Promise((resolve) => setTimeout(resolve, 3000));
    } else {
      logger.warn('Layouts tab not found - may be on a single-layout course page');
    }

    const extractedLayouts = await page.evaluate(() => {
      const siteOrigin = 'https://www.pdga.com';
      // Tried in order; the first pattern that matches supplies the par.
      const parPatterns = [
        /Par[:\s]+(\d+)/i,
        /Par\s*=\s*(\d+)/i,
        /\(Par\s+(\d+)\)/i,
        /Total Par:\s*(\d+)/i,
      ];
      const layoutData = [];

      const tournamentsDiv = document.querySelector('div.tournaments');
      if (!tournamentsDiv) {
        return layoutData;
      }

      tournamentsDiv.querySelectorAll('details.tournament-course').forEach((details) => {
        const resultsDiv = details.querySelector('div.results');
        const resultsLink = resultsDiv ? resultsDiv.querySelector('a') : null;
        const eventHref = resultsLink ? resultsLink.getAttribute('href') : null;
        let fullEventUrl = null;
        if (eventHref) {
          // Only prefix the origin for site-relative hrefs.
          fullEventUrl = /^https?:\/\//i.test(eventHref) ? eventHref : siteOrigin + eventHref;
        }

        const layoutsDiv = details.querySelector('div.layouts');
        if (!layoutsDiv) {
          return;
        }

        layoutsDiv.querySelectorAll('div.layout').forEach((layoutDiv) => {
          // Prefer the explicitly classed heading, fall back to any h4.
          const heading = layoutDiv.querySelector('h4.title') || layoutDiv.querySelector('h4');
          const layoutName = heading
            ? (heading.textContent || heading.innerText || '').trim()
            : '';

          const allText = layoutDiv.textContent || layoutDiv.innerText || '';
          let par = null;
          for (const pattern of parPatterns) {
            const match = allText.match(pattern);
            if (match) {
              par = Number.parseInt(match[1], 10);
              break;
            }
          }

          const divisionsLi = layoutDiv.querySelector('li.divisions');
          const divisions = divisionsLi
            ? (divisionsLi.textContent || '')
              .replace('Divisions:', '')
              .trim()
              .split(/[,\s]+/)
              .filter((token) => token.length > 0)
            : [];

          if (layoutName && Number.isInteger(par) && par > 0) {
            layoutData.push({
              name: layoutName,
              par,
              divisions,
              eventUrl: fullEventUrl,
            });
          }
        });
      });

      return layoutData;
    });

    layouts.push(...extractedLayouts);

    // The cache keeps every layout/event pairing, duplicates included.
    const courseIdInt = typeof courseId === 'string' ? Number.parseInt(courseId, 10) : courseId;
    layoutEventCache.set(courseIdInt, layouts);
    logger.info(`Successfully parsed ${layouts.length} layouts from course page`);

    // Keep the first occurrence of each name/par combination for persistence.
    const firstByKey = new Map();
    for (const layout of layouts) {
      const key = `${layout.name}|${layout.par}`;
      if (!firstByKey.has(key)) {
        firstByKey.set(key, layout);
      }
    }
    const uniqueLayouts = [...firstByKey.values()];
    if (uniqueLayouts.length < layouts.length) {
      logger.info(`Deduplicated to ${uniqueLayouts.length} unique layouts`);
    }

    for (const layout of uniqueLayouts) {
      try {
        await saveLayoutToDB(courseId, layout);
        logger.info(`Saved layout: ${layout.name} (Par ${layout.par})`);
      } catch (err) {
        // Log the error object so the stack is kept, matching the { err }
        // pattern used by the outer handler.
        logger.error({ err }, `Error saving layout ${layout.name}`);
      }
    }
  } catch (error) {
    logger.error({ err: error }, 'Error scraping course layouts');
  } finally {
    await page.close();
  }

  return layouts;
}
/**
 * Convert a PDGA-style date such as "12-Mar-2024" into "2024-03-12".
 *
 * The parts are read explicitly instead of going through `new Date(string)`:
 * that form is not an ISO format, is parsed as local time, and a later
 * conversion to UTC shifts the calendar day in time zones ahead of UTC.
 *
 * @param {*} raw - Text to parse.
 * @returns {string|null} ISO calendar date, or null when `raw` is not a
 *   string in D-Mon-YYYY form or does not name a real calendar day.
 */
function parsePdgaEventDate(raw) {
  if (typeof raw !== 'string') {
    return null;
  }
  const match = /^(\d{1,2})-([A-Z][a-z]{2})-(\d{4})$/.exec(raw);
  if (!match) {
    return null;
  }
  const monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
  const monthIndex = monthNames.indexOf(match[2]);
  if (monthIndex === -1) {
    return null;
  }
  const day = Number.parseInt(match[1], 10);
  const year = Number.parseInt(match[3], 10);
  const utcDate = new Date(Date.UTC(year, monthIndex, day));
  // Date.UTC silently rolls over impossible days (e.g. 31-Feb); reject those.
  const isRealDay = utcDate.getUTCFullYear() === year
    && utcDate.getUTCMonth() === monthIndex
    && utcDate.getUTCDate() === day;
  return isRealDay ? utcDate.toISOString().slice(0, 10) : null;
}

/**
 * Collect round ratings from a PDGA event results page for the given layouts.
 *
 * For each layout, every listed division is looked up by the `h3` whose id
 * equals the division name. Within that division's results table, each round
 * score equal to the layout's par contributes the rating from the adjacent
 * `round-rating` cell. Layouts with at least one rating get an entry keyed
 * `"<name>|<par>"`; if two layouts share a key, the later one wins.
 *
 * Errors during navigation or extraction are logged and whatever was
 * collected up to that point is returned.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} eventUrl - URL of the event results page.
 * @param {Array<{name: string, par: number, divisions: (string[]|undefined)}>} layoutsWithDivisions
 *   Layouts to look up; a layout without `divisions` yields no ratings.
 * @returns {Promise<Object<string, {name: string, par: number, ratings: number[], count: number, meanRating: number, eventDate: (string|null)}>>}
 *   `meanRating` is rounded to the nearest integer; `eventDate` is the first
 *   D-Mon-YYYY date found in the page text as YYYY-MM-DD, or null.
 */
async function scrapeEventResults(browser, eventUrl, layoutsWithDivisions) {
  const page = await browser.newPage();
  const layoutRatings = {};

  try {
    await page.goto(eventUrl, { waitUntil: 'networkidle2', timeout: 45000 });
    await new Promise((resolve) => setTimeout(resolve, 1000));

    // Callbacks given to page.evaluate must be self-contained: they cannot
    // see variables from this Node scope.
    const eventDateRaw = await page.evaluate(() => {
      const match = document.body.textContent.match(/\d{1,2}-[A-Z][a-z]{2}-\d{4}/);
      return match ? match[0] : null;
    });
    const eventDate = parsePdgaEventDate(eventDateRaw);

    for (const layout of layoutsWithDivisions) {
      const ratingsForLayout = [];

      for (const division of layout.divisions ?? []) {
        const divisionData = await page.evaluate((divisionName, targetPar) => {
          // Compare ids directly rather than building an `h3#<name>`
          // selector: a scraped name that is not a valid CSS identifier
          // would make querySelector throw.
          const divisionH3 = Array.from(document.querySelectorAll('h3'))
            .find((heading) => heading.id === divisionName);
          const detailsTag = divisionH3 ? divisionH3.closest('details') : null;
          const table = detailsTag ? detailsTag.querySelector('table.results') : null;
          if (!table) {
            return { found: false, ratings: [] };
          }

          const ratings = [];
          table.querySelectorAll('tbody tr').forEach((row) => {
            row.querySelectorAll('td.round').forEach((roundCell) => {
              const scoreText = (roundCell.textContent || '').trim();
              // Only purely numeric scores count (skips DNF, blanks, ...).
              if (!/^\d+$/.test(scoreText)) {
                return;
              }
              if (Number.parseInt(scoreText, 10) !== targetPar) {
                return;
              }
              const ratingCell = roundCell.nextElementSibling;
              if (!ratingCell || !ratingCell.classList.contains('round-rating')) {
                return;
              }
              const rating = Number.parseInt((ratingCell.textContent || '').trim(), 10);
              if (!Number.isNaN(rating) && rating > 0) {
                ratings.push(rating);
              }
            });
          });
          return { found: true, ratings };
        }, division, layout.par);

        if (divisionData.found && divisionData.ratings.length > 0) {
          ratingsForLayout.push(...divisionData.ratings);
        }
      }

      if (ratingsForLayout.length > 0) {
        const total = ratingsForLayout.reduce((sum, rating) => sum + rating, 0);
        layoutRatings[`${layout.name}|${layout.par}`] = {
          name: layout.name,
          par: layout.par,
          ratings: ratingsForLayout,
          count: ratingsForLayout.length,
          meanRating: Math.round(total / ratingsForLayout.length),
          eventDate,
        };
      }
    }
  } catch (error) {
    logger.error({ err: error }, 'Error scraping event results');
  } finally {
    await page.close();
  }

  return layoutRatings;
}
// Public API of this scraper module (CommonJS).
// The cache Map is exported both directly and through getLayoutEventCache();
// both give access to the same Map instance.
module.exports = {
layoutEventCache,
getLayoutEventCache,
scrapeCourseDirectory,
scrapeCourseLayouts,
scrapeEventResults
};