Updated Yahoo company extraction JS to select the most data-rich row

This commit is contained in:
2025-12-19 14:43:36 +01:00
parent b366f366e6
commit 5e81959322

View File

@@ -45,33 +45,93 @@ var extractionResult = (function() {
return text; return text;
} }
// Helper function to check if value is valid (not empty, not -, not N/A) // Helper function to check if a cell actually contains data
function isValidValue(value) { // Multiple indicators are used to determine if data is present
if (!value) return false; function hasValidData(cellElement) {
const normalized = value.trim().toLowerCase(); if (!cellElement) return false;
return normalized !== '' && normalized !== '-' && normalized !== 'n/a';
// Indicator 1: Check if the cell contains a <p> tag (Yahoo uses this for "no data")
const pTag = cellElement.querySelector('p');
if (pTag) return false;
// Indicator 2: Check the direct child structure
// Valid data cells have: td > span > div or td > span > div > a
// Invalid data cells have: td > span > p
const span = cellElement.querySelector('span');
if (span) {
const directChildren = Array.from(span.children);
// If the only or first child is a <p>, it's likely "no data"
if (directChildren.length > 0 && directChildren[0].tagName === 'P') {
return false;
}
}
// Indicator 3: Check text content
const text = extractText(cellElement);
if (!text) return false;
const normalized = text.toLowerCase().trim();
// Common "no data" indicators
const noDataIndicators = [
'-',
'n/a',
'na',
'none',
'not available',
'no data',
'--',
'—', // em dash
'–', // en dash
];
if (noDataIndicators.includes(normalized)) {
return false;
}
// Indicator 4: Check for common CSS classes that indicate empty state
const classIndicators = ['empty', 'no-data', 'na', 'null', 'undefined'];
const classList = cellElement.className || '';
for (const indicator of classIndicators) {
if (classList.includes(indicator)) {
return false;
}
}
// Indicator 5: Check if cell has an anchor tag (usually indicates real data)
const hasLink = cellElement.querySelector('a') !== null;
// Indicator 6: Check if there's actual substantial content
// If text is very short (1-2 chars) and not alphanumeric, it's likely not real data
if (text.length <= 2 && !/[a-zA-Z0-9]/.test(text)) {
return false;
}
// If we passed all checks, consider it valid data
return true;
}
// Helper function to extract and normalize data from a cell
function extractCellData(cellElement) {
if (!cellElement) return null;
if (!hasValidData(cellElement)) return null;
const text = extractText(cellElement);
return text || null;
} }
// Helper function to extract and normalize data from a row // Helper function to extract and normalize data from a row
function extractRowData(row) { function extractRowData(row) {
// Extract ticker from column 1 (td:nth-child(1) > span > div > a) // Extract ticker from column 1 (td:nth-child(1))
const tickerElement = row.querySelector('td:nth-child(1) > span > div > a') || const tickerCell = row.querySelector('td:nth-child(1)');
row.querySelector('td:nth-child(1)'); const ticker = extractCellData(tickerCell);
const tickerRaw = extractText(tickerElement);
const ticker = isValidValue(tickerRaw) ? tickerRaw : null;
// Extract sector from column 4 (td:nth-child(4) > span > div > a or td:nth-child(4) > span > div) // Extract sector from column 4 (td:nth-child(4))
const sectorElement = row.querySelector('td:nth-child(4) > span > div > a') || const sectorCell = row.querySelector('td:nth-child(4)');
row.querySelector('td:nth-child(4) > span > div') || const sector = extractCellData(sectorCell);
row.querySelector('td:nth-child(4)');
const sectorRaw = extractText(sectorElement);
const sector = isValidValue(sectorRaw) ? sectorRaw : null;
// Extract exchange from column 6 (td:nth-child(6) > span > div) // Extract exchange from column 6 (td:nth-child(6))
const exchangeElement = row.querySelector('td:nth-child(6) > span > div') || const exchangeCell = row.querySelector('td:nth-child(6)');
row.querySelector('td:nth-child(6)'); const exchange = extractCellData(exchangeCell);
const exchangeRaw = extractText(exchangeElement);
const exchange = isValidValue(exchangeRaw) ? exchangeRaw : null;
return { ticker, sector, exchange }; return { ticker, sector, exchange };
} }
@@ -85,21 +145,37 @@ var extractionResult = (function() {
return count; return count;
} }
// Helper function to rank a row by how complete its extracted data is.
// Takes the { ticker, sector, exchange } object for a row and returns a
// numeric score; a higher score means more fields are present (truthy).
function scoreRow(data) {
    // Ticker carries the dominant weight: 100 exceeds the combined weight of
    // the other two fields (10 + 10), so a row with a ticker always outranks
    // a row without one.
    const tickerPoints = data.ticker ? 100 : 0;
    // Sector and exchange are secondary and contribute equally.
    const sectorPoints = data.sector ? 10 : 0;
    const exchangePoints = data.exchange ? 10 : 0;
    return tickerPoints + sectorPoints + exchangePoints;
}
// Extract data from all rows and find the one with most complete data // Extract data from all rows and find the one with most complete data
let bestRow = null; let bestRow = null;
let maxFieldCount = -1; let maxScore = -1;
let rowIndex = 0; let rowIndex = 0;
for (const row of allRows) { for (const row of allRows) {
const data = extractRowData(row); const data = extractRowData(row);
const fieldCount = countValidFields(data); const score = scoreRow(data);
// Select row with most valid data, or first row if tied // Select row with highest score (most complete data)
if (fieldCount > maxFieldCount) { // If tied, first row wins
if (score > maxScore) {
bestRow = data; bestRow = data;
maxFieldCount = fieldCount; maxScore = score;
bestRow.rowIndex = rowIndex; bestRow.rowIndex = rowIndex;
bestRow.validFieldCount = fieldCount; bestRow.validFieldCount = countValidFields(data);
bestRow.score = score;
} }
rowIndex++; rowIndex++;
@@ -126,6 +202,7 @@ var extractionResult = (function() {
metadata: { metadata: {
selectedRowIndex: bestRow.rowIndex, selectedRowIndex: bestRow.rowIndex,
validFieldCount: bestRow.validFieldCount, validFieldCount: bestRow.validFieldCount,
score: bestRow.score,
totalRows: allRows.length totalRows: allRows.length
} }
}; };