Updated the Yahoo company extraction JS to select the most data-rich row
This commit is contained in:
@@ -45,33 +45,93 @@ var extractionResult = (function() {
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
 * Reports whether a scraped value carries real content.
 *
 * @param {*} value - Raw value, normally the text taken from a table cell.
 * @returns {boolean} `false` for any falsy input and for the placeholders
 *   '', '-' and 'n/a' (compared after trimming, case-insensitively);
 *   `true` for everything else.
 */
function isValidValue(value) {
  if (!value) return false;

  // Coerce first so a truthy non-string (e.g. a number) cannot throw on .trim().
  const normalized = String(value).trim().toLowerCase();

  return normalized !== '' && normalized !== '-' && normalized !== 'n/a';
}
|
||||
/**
 * Decides whether a table cell holds real data rather than a "no data"
 * placeholder. Several independent indicators are checked in order; the
 * first one that signals "no data" makes the function return `false`.
 *
 *  1. The cell contains a <p> element anywhere inside it.
 *  2. The first element child of the cell's first <span> is a <p>.
 *  3. The extracted text is empty or is a known placeholder string.
 *  4. The cell's class attribute carries an empty-state marker.
 *  5. The text is one or two characters long and has no letter or digit.
 *
 * @param {Element|null|undefined} cellElement - The <td> (or similar) to inspect.
 * @returns {boolean} `true` when none of the indicators fire.
 */
function hasValidData(cellElement) {
  if (!cellElement) return false;

  // Indicator 1: per the original author's note, Yahoo renders "no data"
  // cells with a <p> tag.
  if (cellElement.querySelector('p')) return false;

  // Indicator 2: valid cells look like td > span > div (> a); empty ones
  // look like td > span > p. Only the first element child is inspected.
  const span = cellElement.querySelector('span');
  if (span) {
    const firstChild = span.children[0];
    if (firstChild && firstChild.tagName === 'P') return false;
  }

  // Indicator 3: placeholder text.
  const text = extractText(cellElement);
  if (!text) return false;

  const normalized = text.toLowerCase().trim();
  const noDataIndicators = [
    '-',
    'n/a',
    'na',
    'none',
    'not available',
    'no data',
    '--',
    '—', // em dash
    '–', // en dash
  ];
  if (noDataIndicators.includes(normalized)) return false;

  // Indicator 4: empty-state CSS classes. A marker must be a whole class
  // name or a '-'/'_' delimited part of one. A plain substring test would
  // wrongly fire on unrelated classes ('na' is inside "name", "final", ...).
  const className =
    typeof cellElement.className === 'string' ? cellElement.className : '';
  const emptyStateClass =
    /(?:^|[\s_-])(?:empty|no-data|na|null|undefined)(?=$|[\s_-])/;
  if (emptyStateClass.test(className)) return false;

  // Indicator 5: very short text without any alphanumeric character is
  // treated as decoration, not data.
  if (text.length <= 2 && !/[a-zA-Z0-9]/.test(text)) return false;

  // Every check passed: treat the cell as holding valid data.
  return true;
}
|
||||
|
||||
/**
 * Returns the text of a table cell, or `null` when the cell has nothing usable.
 *
 * @param {Element|null|undefined} cellElement - Cell to read.
 * @returns {string|null} The result of `extractText` for a cell that
 *   `hasValidData` accepts; `null` for a missing cell, a rejected cell, or
 *   empty extracted text.
 */
function extractCellData(cellElement) {
  if (!cellElement) return null;
  if (!hasValidData(cellElement)) return null;

  // Map an empty extraction result to null so callers see one "absent" value.
  return extractText(cellElement) || null;
}
|
||||
|
||||
/**
 * Pulls the ticker, sector and exchange out of one table row.
 *
 * The fields are read from the 1st, 4th and 6th <td> of the row
 * respectively, each through `extractCellData`, so a field is `null`
 * whenever its cell is missing or holds no usable data.
 *
 * @param {Element} row - The <tr> element to read.
 * @returns {{ticker: (string|null), sector: (string|null), exchange: (string|null)}}
 */
function extractRowData(row) {
  // :nth-child positions are 1-based.
  const cellAt = (position) => row.querySelector(`td:nth-child(${position})`);

  const ticker = extractCellData(cellAt(1));
  const sector = extractCellData(cellAt(4));
  const exchange = extractCellData(cellAt(6));

  return { ticker, sector, exchange };
}
|
||||
@@ -85,21 +145,37 @@ var extractionResult = (function() {
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
 * Scores an extracted row so that more complete rows rank higher.
 *
 * A ticker is worth 100 points; sector and exchange are worth 10 each.
 * Because 100 > 10 + 10, any row with a ticker outranks every row without one.
 *
 * @param {{ticker: ?string, sector: ?string, exchange: ?string}|null|undefined} data
 *   Row data as produced by the row extraction step.
 * @returns {number} Score in the range 0-120; 0 for missing `data`.
 */
function scoreRow(data) {
  if (!data) return 0;

  const TICKER_WEIGHT = 100; // mandatory field, highest weight
  const OPTIONAL_WEIGHT = 10; // sector / exchange are nice-to-have

  let score = 0;
  if (data.ticker) score += TICKER_WEIGHT;
  if (data.sector) score += OPTIONAL_WEIGHT;
  if (data.exchange) score += OPTIONAL_WEIGHT;

  return score;
}
|
||||
|
||||
// Extract data from all rows and find the one with most complete data
|
||||
let bestRow = null;
|
||||
let maxFieldCount = -1;
|
||||
let maxScore = -1;
|
||||
let rowIndex = 0;
|
||||
|
||||
for (const row of allRows) {
|
||||
const data = extractRowData(row);
|
||||
const fieldCount = countValidFields(data);
|
||||
const score = scoreRow(data);
|
||||
|
||||
// Select row with most valid data, or first row if tied
|
||||
if (fieldCount > maxFieldCount) {
|
||||
// Select row with highest score (most complete data)
|
||||
// If tied, first row wins
|
||||
if (score > maxScore) {
|
||||
bestRow = data;
|
||||
maxFieldCount = fieldCount;
|
||||
maxScore = score;
|
||||
bestRow.rowIndex = rowIndex;
|
||||
bestRow.validFieldCount = fieldCount;
|
||||
bestRow.validFieldCount = countValidFields(data);
|
||||
bestRow.score = score;
|
||||
}
|
||||
|
||||
rowIndex++;
|
||||
@@ -126,6 +202,7 @@ var extractionResult = (function() {
|
||||
metadata: {
|
||||
selectedRowIndex: bestRow.rowIndex,
|
||||
validFieldCount: bestRow.validFieldCount,
|
||||
score: bestRow.score,
|
||||
totalRows: allRows.length
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user