d567c4bca9
- Switch from Alpine to Debian slim for correct Chromium architecture
(fixes ARM/Apple Silicon support)
- Upgrade Puppeteer 21 to 24, use system Chromium via PUPPETEER_EXECUTABLE_PATH
- Replace removed page.waitForTimeout() with setTimeout
- Set NODE_ENV=production in Dockerfile to prevent pino-pretty import
- Improve error logging with Pino's { err: error } pattern
- Add build: . to docker-compose for local development builds
351 lines
11 KiB
JavaScript
351 lines
11 KiB
JavaScript
const { saveCourseToDB, saveLayoutToDB } = require('../models/course');
|
|
const logger = require('../logger');
|
|
|
|
// In-memory cache for layout-division-event mapping.
// scrapeCourseLayouts() stores the layouts it scraped for a course here,
// keyed by the course id.
const layoutEventCache = new Map();

/**
 * Accessor for the shared layout cache.
 *
 * @returns {Map} The live cache instance itself (not a copy), so mutations
 *   made by the caller are visible to every other user of the cache.
 */
function getLayoutEventCache() {
  return layoutEventCache;
}
|
|
|
|
/**
 * In-page extractor for one course-directory result page.
 *
 * Passed to page.evaluate(), so it runs in the page context: it must stay
 * self-contained and may only touch the DOM (no references to module scope).
 *
 * @returns {Array<{name: string, link: string, city: string}>} One entry per
 *   table row that has a title cell containing a link. `city` is 'Unknown'
 *   when the row has no location cell.
 */
function extractCourseRowsInPage() {
  const courseData = [];

  for (const row of document.querySelectorAll('table tbody tr')) {
    const link = row.querySelector('td.views-field-title')?.querySelector('a');
    if (!link) {
      continue;
    }

    const locationCell = row.querySelector('td.views-field-field-course-location');
    courseData.push({
      name: link.innerText.trim(),
      link: 'https://www.pdga.com' + link.getAttribute('href'),
      city: locationCell ? locationCell.innerText.trim() : 'Unknown',
    });
  }

  return courseData;
}

/**
 * Walks the paginated PDGA course directory (country filter SE, 18-100 holes)
 * and persists every course found through saveCourseToDB().
 *
 * Pagination stops when a page has no course rows, or when a page contains
 * only courses that were already collected (guards against an endless loop
 * if the site keeps serving the same rows for out-of-range page numbers).
 * A failure to save one course is logged and does not stop the scrape; any
 * other error is logged and ends the scrape with the courses gathered so far.
 *
 * @param {object} browser - Browser handle exposing `newPage()`.
 * @returns {Promise<Array<{name: string, link: string, city: string}>>}
 *   The distinct courses (by link) that were scraped.
 */
async function scrapeCourseDirectory(browser) {
  logger.info('Scraping Swedish courses from PDGA course directory');
  const page = await browser.newPage();
  const allCourses = [];
  const seenLinks = new Set();
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  let pageNumber = 0;

  try {
    while (true) {
      const url = `https://www.pdga.com/course-directory/advanced?title=&field_course_location_country=SE&field_course_location_locality=&field_course_location_administrative_area=All&field_course_location_postal_code=&field_course_type_value=All&rating_value=All&field_course_holes_value=18-100&field_course_total_length_value=All&field_course_target_type_value=All&field_course_tee_type_value=All&field_location_type_value=All&field_course_camping_value=All&field_course_facilities_value=All&field_course_fees_value=All&field_course_handicap_value=All&field_course_private_value=All&field_course_signage_value=All&field_cart_friendly_value=All&page=${pageNumber}`;

      logger.info(`Scraping page ${pageNumber}...`);
      await page.goto(url, { waitUntil: 'networkidle2', timeout: 45000 });
      await pause(1000);

      const courses = await page.evaluate(extractCourseRowsInPage);

      if (courses.length === 0) {
        logger.info(`No courses found on page ${pageNumber}, stopping pagination`);
        break;
      }

      // Keep only courses not collected on an earlier page (or earlier in
      // this page), identified by their link.
      const newCourses = [];
      for (const course of courses) {
        if (seenLinks.has(course.link)) {
          continue;
        }
        seenLinks.add(course.link);
        newCourses.push(course);
      }

      if (newCourses.length === 0) {
        logger.warn(`Page ${pageNumber} only repeats courses already scraped, stopping pagination`);
        break;
      }

      logger.info(`Found ${courses.length} courses on page ${pageNumber}`);
      allCourses.push(...newCourses);

      for (const course of newCourses) {
        try {
          await saveCourseToDB(course);
          logger.info(`Saved course: ${course.name} (${course.city})`);
        } catch (err) {
          // Pass the error object so the logger can serialize the stack too.
          logger.error({ err }, `Error saving course ${course.name}`);
        }
      }

      pageNumber++;

      // Be polite to the remote site between page requests.
      logger.info('Waiting 2s before next page...');
      await pause(2000);
    }

    logger.info(`Total courses scraped: ${allCourses.length} across ${pageNumber} pages`);
  } catch (error) {
    logger.error({ err: error }, 'Error scraping course directory');
  } finally {
    await page.close();
  }

  return allCourses;
}
|
|
|
|
/**
 * In-page helper: finds the first tab link whose text mentions "Layout" and
 * clicks it. Passed to page.evaluate(), so it must stay self-contained.
 *
 * @returns {boolean} true when a tab was clicked, false when none matched.
 */
function clickLayoutsTabInPage() {
  // Ordered from most specific to most generic.
  const selectors = [
    'a.quicktabs-tab-course_node-2',
    'li.quicktabs-tab-course_node-2 a',
    'a[href*="layouts"]',
    '.quicktabs-tabs a',
    'ul.quicktabs-tabs a',
    '.quicktabs-wrapper a',
  ];

  for (const selector of selectors) {
    for (const tab of document.querySelectorAll(selector)) {
      const text = tab.innerText?.trim();
      // 'Layout' also covers 'Layouts'.
      if (text && text.includes('Layout')) {
        tab.click();
        return true;
      }
    }
  }

  return false;
}

/**
 * In-page helper: reads every layout listed under `div.tournaments`.
 * Passed to page.evaluate(), so it must stay self-contained.
 *
 * A layout is kept only when it has a heading text and a positive par can be
 * parsed from its text. `eventUrl` is the results link of the enclosing
 * tournament resolved against https://www.pdga.com, or null when absent.
 *
 * @returns {Array<{name: string, par: number, divisions: string[], eventUrl: (string|null)}>}
 */
function extractLayoutsInPage() {
  const layoutData = [];
  const tournamentsDiv = document.querySelector('div.tournaments');
  if (!tournamentsDiv) {
    return layoutData;
  }

  const parPatterns = [
    /Par[:\s]+(\d+)/i,
    /Par\s*=\s*(\d+)/i,
    /\(Par\s+(\d+)\)/i,
    /Total Par:\s*(\d+)/i,
  ];

  for (const details of tournamentsDiv.querySelectorAll('details.tournament-course')) {
    const href = details.querySelector('div.results')?.querySelector('a')?.getAttribute('href');
    // Resolve instead of concatenating so an absolute href is not mangled.
    const fullEventUrl = href ? new URL(href, 'https://www.pdga.com').href : null;

    const layoutsDiv = details.querySelector('div.layouts');
    if (!layoutsDiv) {
      continue;
    }

    for (const layoutDiv of layoutsDiv.querySelectorAll('div.layout')) {
      const heading = layoutDiv.querySelector('h4.title') ?? layoutDiv.querySelector('h4');
      const layoutName = heading ? (heading.textContent || heading.innerText || '').trim() : '';

      const allText = layoutDiv.textContent || layoutDiv.innerText || '';
      let par = null;
      for (const pattern of parPatterns) {
        const match = allText.match(pattern);
        if (match) {
          par = Number.parseInt(match[1], 10);
          break;
        }
      }

      const divisionsLi = layoutDiv.querySelector('li.divisions');
      const divisions = divisionsLi
        ? (divisionsLi.textContent || '')
          .replace('Divisions:', '')
          .trim()
          .split(/[,\s]+/)
          .filter((d) => d.length > 0)
        : [];

      if (layoutName && par !== null && par > 0) {
        layoutData.push({
          name: layoutName,
          par,
          divisions,
          eventUrl: fullEventUrl,
        });
      }
    }
  }

  return layoutData;
}

/**
 * Scrapes the layouts shown on a course page, caches them and saves the
 * distinct ones through saveLayoutToDB().
 *
 * All scraped layouts (duplicates included) are stored in layoutEventCache
 * under the numeric course id and returned. Only the first layout per
 * `name|par` pair is saved to the database. A failure to save one layout is
 * logged and skipped; any other error is logged and ends the scrape.
 *
 * @param {object} browser - Browser handle exposing `newPage()`.
 * @param {string} courseLink - URL of the course page.
 * @param {(string|number)} courseId - Course id; strings are parsed as
 *   base-10 integers for the cache key, and passed unchanged to saveLayoutToDB().
 * @returns {Promise<Array<{name: string, par: number, divisions: string[], eventUrl: (string|null)}>>}
 *   Every layout scraped (empty when the page could not be processed).
 */
async function scrapeCourseLayouts(browser, courseLink, courseId) {
  logger.info(`Scraping layouts from: ${courseLink}`);
  const page = await browser.newPage();
  const layouts = [];
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  try {
    await page.goto(courseLink, { waitUntil: 'networkidle2', timeout: 45000 });
    await pause(1000);

    const layoutsTabClicked = await page.evaluate(clickLayoutsTabInPage);
    if (layoutsTabClicked) {
      logger.info('Layouts tab found and clicked');
      // Give the tab content time to render before reading it.
      await pause(3000);
    } else {
      logger.warn('Layouts tab not found - may be on a single-layout course page');
    }

    const extractedLayouts = await page.evaluate(extractLayoutsInPage);
    layouts.push(...extractedLayouts);

    const courseIdInt = typeof courseId === 'string' ? Number.parseInt(courseId, 10) : courseId;
    layoutEventCache.set(courseIdInt, layouts);

    logger.info(`Successfully parsed ${layouts.length} layouts from course page`);

    // Keep the first occurrence of each name/par combination.
    const uniqueByKey = new Map();
    for (const layout of layouts) {
      const key = `${layout.name}|${layout.par}`;
      if (!uniqueByKey.has(key)) {
        uniqueByKey.set(key, layout);
      }
    }
    const uniqueLayouts = [...uniqueByKey.values()];

    if (uniqueLayouts.length < layouts.length) {
      logger.info(`Deduplicated to ${uniqueLayouts.length} unique layouts`);
    }

    for (const layout of uniqueLayouts) {
      try {
        await saveLayoutToDB(courseId, layout);
        logger.info(`Saved layout: ${layout.name} (Par ${layout.par})`);
      } catch (err) {
        // Pass the error object so the logger can serialize the stack too.
        logger.error({ err }, `Error saving layout ${layout.name}`);
      }
    }
  } catch (error) {
    logger.error({ err: error }, 'Error scraping course layouts');
  } finally {
    await page.close();
  }

  return layouts;
}
|
|
|
|
/**
 * In-page helper: returns the first `D-Mon-YYYY` style date found in the
 * page text (e.g. "12-Jan-2024"), or null. Passed to page.evaluate(), so it
 * must stay self-contained.
 *
 * @returns {(string|null)}
 */
function findEventDateInPage() {
  const match = document.body.textContent.match(/\d{1,2}-[A-Z][a-z]{2}-\d{4}/);
  return match ? match[0] : null;
}

/**
 * Converts a `D-Mon-YYYY` date (English three-letter month) to `YYYY-MM-DD`.
 *
 * The string is parsed by hand instead of with `new Date(string)`: that form
 * is not a standard date format, and where it is accepted it is read as local
 * time, so converting to an ISO (UTC) string can move the date to the
 * previous day in timezones east of UTC.
 *
 * @param {(string|null)} rawDate - Date text such as "12-Jan-2024".
 * @returns {(string|null)} ISO calendar date, or null when the input is
 *   missing, malformed, or not a real calendar date.
 */
function toIsoEventDate(rawDate) {
  const months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
  const parts = /^(\d{1,2})-([A-Za-z]{3})-(\d{4})$/.exec(rawDate ?? '');
  if (!parts) {
    return null;
  }

  const day = Number.parseInt(parts[1], 10);
  const monthIndex = months.indexOf(parts[2].toLowerCase());
  const year = Number.parseInt(parts[3], 10);
  if (monthIndex === -1) {
    return null;
  }

  // Round-trip through UTC to reject impossible dates such as 31-Feb.
  const utc = new Date(Date.UTC(year, monthIndex, day));
  const isRealDate = utc.getUTCFullYear() === year
    && utc.getUTCMonth() === monthIndex
    && utc.getUTCDate() === day;
  return isRealDate ? utc.toISOString().split('T')[0] : null;
}

/**
 * In-page helper: collects round ratings for one division where the round
 * score equals `targetPar`. Passed to page.evaluate(), so it must stay
 * self-contained.
 *
 * The division is located through the first `h3` whose id equals
 * `divisionName`; its enclosing `details` element must contain a
 * `table.results`. The id is compared as a plain string rather than
 * interpolated into a selector, so names that are not valid CSS identifiers
 * cannot raise a selector syntax error.
 *
 * @param {string} divisionName - Id of the division heading.
 * @param {number} targetPar - Round score to match exactly.
 * @returns {{found: boolean, ratings: number[]}} `found` is false when the
 *   heading, its `details` ancestor or the results table is missing.
 */
function collectDivisionRatingsInPage(divisionName, targetPar) {
  const notFound = { found: false, ratings: [] };

  const divisionH3 = Array.from(document.querySelectorAll('h3')).find((h3) => h3.id === divisionName);
  if (!divisionH3) {
    return notFound;
  }

  const table = divisionH3.closest('details')?.querySelector('table.results');
  if (!table) {
    return notFound;
  }

  const ratings = [];
  for (const row of table.querySelectorAll('tbody tr')) {
    for (const roundCell of row.querySelectorAll('td.round')) {
      const scoreText = (roundCell.textContent || '').trim();
      if (!/^\d+$/.test(scoreText) || Number.parseInt(scoreText, 10) !== targetPar) {
        continue;
      }

      // The rating is expected in the cell right after the round score.
      const ratingCell = roundCell.nextElementSibling;
      if (!ratingCell || !ratingCell.classList.contains('round-rating')) {
        continue;
      }

      const rating = Number.parseInt((ratingCell.textContent || '').trim(), 10);
      if (!Number.isNaN(rating) && rating > 0) {
        ratings.push(rating);
      }
    }
  }

  return { found: true, ratings };
}

/**
 * Scrapes an event results page and, for each layout, gathers the ratings of
 * rounds whose score equals the layout's par across the layout's divisions.
 *
 * Errors are logged, not thrown; whatever was gathered before the error is
 * returned.
 *
 * @param {object} browser - Browser handle exposing `newPage()`.
 * @param {string} eventUrl - URL of the event results page.
 * @param {Array<{name: string, par: number, divisions: (string[]|undefined)}>} layoutsWithDivisions
 *   Layouts to look up; a layout without `divisions` is treated as having none.
 * @returns {Promise<Object<string, {name: string, par: number, ratings: number[], count: number, meanRating: number, eventDate: (string|null)}>>}
 *   Map keyed by `name|par`; layouts with no matching ratings are omitted.
 *   `meanRating` is rounded to the nearest integer.
 */
async function scrapeEventResults(browser, eventUrl, layoutsWithDivisions) {
  const page = await browser.newPage();
  const layoutRatings = {};

  try {
    await page.goto(eventUrl, { waitUntil: 'networkidle2', timeout: 45000 });
    await new Promise((resolve) => setTimeout(resolve, 1000));

    const eventDate = toIsoEventDate(await page.evaluate(findEventDateInPage));

    for (const layout of layoutsWithDivisions) {
      const ratingsForLayout = [];

      for (const division of layout.divisions ?? []) {
        const divisionData = await page.evaluate(collectDivisionRatingsInPage, division, layout.par);
        if (divisionData.found && divisionData.ratings.length > 0) {
          ratingsForLayout.push(...divisionData.ratings);
        }
      }

      if (ratingsForLayout.length === 0) {
        continue;
      }

      const total = ratingsForLayout.reduce((sum, rating) => sum + rating, 0);
      layoutRatings[`${layout.name}|${layout.par}`] = {
        name: layout.name,
        par: layout.par,
        ratings: ratingsForLayout,
        count: ratingsForLayout.length,
        meanRating: Math.round(total / ratingsForLayout.length),
        eventDate,
      };
    }
  } catch (error) {
    logger.error({ err: error }, 'Error scraping event results');
  } finally {
    await page.close();
  }

  return layoutRatings;
}
|
|
|
|
module.exports = {
|
|
layoutEventCache,
|
|
getLayoutEventCache,
|
|
scrapeCourseDirectory,
|
|
scrapeCourseLayouts,
|
|
scrapeEventResults
|
|
};
|