updated yahoo company extraction js to get the most data rich row
This commit is contained in:
@@ -45,33 +45,93 @@ var extractionResult = (function() {
|
|||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to check if value is valid (not empty, not -, not N/A)
|
// Helper function to check if a cell actually contains data
|
||||||
function isValidValue(value) {
|
// Multiple indicators are used to determine if data is present
|
||||||
if (!value) return false;
|
function hasValidData(cellElement) {
|
||||||
const normalized = value.trim().toLowerCase();
|
if (!cellElement) return false;
|
||||||
return normalized !== '' && normalized !== '-' && normalized !== 'n/a';
|
|
||||||
|
// Indicator 1: Check if the cell contains a <p> tag (Yahoo uses this for "no data")
|
||||||
|
const pTag = cellElement.querySelector('p');
|
||||||
|
if (pTag) return false;
|
||||||
|
|
||||||
|
// Indicator 2: Check the direct child structure
|
||||||
|
// Valid data cells have: td > span > div or td > span > div > a
|
||||||
|
// Invalid data cells have: td > span > p
|
||||||
|
const span = cellElement.querySelector('span');
|
||||||
|
if (span) {
|
||||||
|
const directChildren = Array.from(span.children);
|
||||||
|
// If the only or first child is a <p>, it's likely "no data"
|
||||||
|
if (directChildren.length > 0 && directChildren[0].tagName === 'P') {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indicator 3: Check text content
|
||||||
|
const text = extractText(cellElement);
|
||||||
|
if (!text) return false;
|
||||||
|
const normalized = text.toLowerCase().trim();
|
||||||
|
|
||||||
|
// Common "no data" indicators
|
||||||
|
const noDataIndicators = [
|
||||||
|
'-',
|
||||||
|
'n/a',
|
||||||
|
'na',
|
||||||
|
'none',
|
||||||
|
'not available',
|
||||||
|
'no data',
|
||||||
|
'--',
|
||||||
|
'—', // em dash
|
||||||
|
'–', // en dash
|
||||||
|
];
|
||||||
|
|
||||||
|
if (noDataIndicators.includes(normalized)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indicator 4: Check for common CSS classes that indicate empty state
|
||||||
|
const classIndicators = ['empty', 'no-data', 'na', 'null', 'undefined'];
|
||||||
|
const classList = cellElement.className || '';
|
||||||
|
for (const indicator of classIndicators) {
|
||||||
|
if (classList.includes(indicator)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indicator 5: Check if cell has an anchor tag (usually indicates real data)
|
||||||
|
const hasLink = cellElement.querySelector('a') !== null;
|
||||||
|
|
||||||
|
// Indicator 6: Check if there's actual substantial content
|
||||||
|
// If text is very short (1-2 chars) and not alphanumeric, it's likely not real data
|
||||||
|
if (text.length <= 2 && !/[a-zA-Z0-9]/.test(text)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we passed all checks, consider it valid data
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to extract and normalize data from a cell
|
||||||
|
function extractCellData(cellElement) {
|
||||||
|
if (!cellElement) return null;
|
||||||
|
if (!hasValidData(cellElement)) return null;
|
||||||
|
|
||||||
|
const text = extractText(cellElement);
|
||||||
|
return text || null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to extract and normalize data from a row
|
// Helper function to extract and normalize data from a row
|
||||||
function extractRowData(row) {
|
function extractRowData(row) {
|
||||||
// Extract ticker from column 1 (td:nth-child(1) > span > div > a)
|
// Extract ticker from column 1 (td:nth-child(1))
|
||||||
const tickerElement = row.querySelector('td:nth-child(1) > span > div > a') ||
|
const tickerCell = row.querySelector('td:nth-child(1)');
|
||||||
row.querySelector('td:nth-child(1)');
|
const ticker = extractCellData(tickerCell);
|
||||||
const tickerRaw = extractText(tickerElement);
|
|
||||||
const ticker = isValidValue(tickerRaw) ? tickerRaw : null;
|
|
||||||
|
|
||||||
// Extract sector from column 4 (td:nth-child(4) > span > div > a or td:nth-child(4) > span > div)
|
// Extract sector from column 4 (td:nth-child(4))
|
||||||
const sectorElement = row.querySelector('td:nth-child(4) > span > div > a') ||
|
const sectorCell = row.querySelector('td:nth-child(4)');
|
||||||
row.querySelector('td:nth-child(4) > span > div') ||
|
const sector = extractCellData(sectorCell);
|
||||||
row.querySelector('td:nth-child(4)');
|
|
||||||
const sectorRaw = extractText(sectorElement);
|
|
||||||
const sector = isValidValue(sectorRaw) ? sectorRaw : null;
|
|
||||||
|
|
||||||
// Extract exchange from column 6 (td:nth-child(6) > span > div)
|
// Extract exchange from column 6 (td:nth-child(6))
|
||||||
const exchangeElement = row.querySelector('td:nth-child(6) > span > div') ||
|
const exchangeCell = row.querySelector('td:nth-child(6)');
|
||||||
row.querySelector('td:nth-child(6)');
|
const exchange = extractCellData(exchangeCell);
|
||||||
const exchangeRaw = extractText(exchangeElement);
|
|
||||||
const exchange = isValidValue(exchangeRaw) ? exchangeRaw : null;
|
|
||||||
|
|
||||||
return { ticker, sector, exchange };
|
return { ticker, sector, exchange };
|
||||||
}
|
}
|
||||||
@@ -85,21 +145,37 @@ var extractionResult = (function() {
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper function to score a row (prioritize rows with more complete data)
|
||||||
|
function scoreRow(data) {
|
||||||
|
let score = 0;
|
||||||
|
|
||||||
|
// Ticker is mandatory and gets highest weight
|
||||||
|
if (data.ticker) score += 100;
|
||||||
|
|
||||||
|
// Sector and exchange are nice-to-have
|
||||||
|
if (data.sector) score += 10;
|
||||||
|
if (data.exchange) score += 10;
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
// Extract data from all rows and find the one with most complete data
|
// Extract data from all rows and find the one with most complete data
|
||||||
let bestRow = null;
|
let bestRow = null;
|
||||||
let maxFieldCount = -1;
|
let maxScore = -1;
|
||||||
let rowIndex = 0;
|
let rowIndex = 0;
|
||||||
|
|
||||||
for (const row of allRows) {
|
for (const row of allRows) {
|
||||||
const data = extractRowData(row);
|
const data = extractRowData(row);
|
||||||
const fieldCount = countValidFields(data);
|
const score = scoreRow(data);
|
||||||
|
|
||||||
// Select row with most valid data, or first row if tied
|
// Select row with highest score (most complete data)
|
||||||
if (fieldCount > maxFieldCount) {
|
// If tied, first row wins
|
||||||
|
if (score > maxScore) {
|
||||||
bestRow = data;
|
bestRow = data;
|
||||||
maxFieldCount = fieldCount;
|
maxScore = score;
|
||||||
bestRow.rowIndex = rowIndex;
|
bestRow.rowIndex = rowIndex;
|
||||||
bestRow.validFieldCount = fieldCount;
|
bestRow.validFieldCount = countValidFields(data);
|
||||||
|
bestRow.score = score;
|
||||||
}
|
}
|
||||||
|
|
||||||
rowIndex++;
|
rowIndex++;
|
||||||
@@ -126,6 +202,7 @@ var extractionResult = (function() {
|
|||||||
metadata: {
|
metadata: {
|
||||||
selectedRowIndex: bestRow.rowIndex,
|
selectedRowIndex: bestRow.rowIndex,
|
||||||
validFieldCount: bestRow.validFieldCount,
|
validFieldCount: bestRow.validFieldCount,
|
||||||
|
score: bestRow.score,
|
||||||
totalRows: allRows.length
|
totalRows: allRows.length
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user