From 5e81959322a672875cf1ba7fb1e99d2f44d0cb7e Mon Sep 17 00:00:00 2001 From: donpat1to Date: Fri, 19 Dec 2025 14:43:36 +0100 Subject: [PATCH] updated yahoo company extraction js to get the most data rich row --- src/corporate/yahoo_company_extraction.js | 131 +++++++++++++++++----- 1 file changed, 104 insertions(+), 27 deletions(-) diff --git a/src/corporate/yahoo_company_extraction.js b/src/corporate/yahoo_company_extraction.js index 082ce9f..0b72d81 100644 --- a/src/corporate/yahoo_company_extraction.js +++ b/src/corporate/yahoo_company_extraction.js @@ -45,33 +45,93 @@ var extractionResult = (function() { return text; } - // Helper function to check if value is valid (not empty, not -, not N/A) - function isValidValue(value) { - if (!value) return false; - const normalized = value.trim().toLowerCase(); - return normalized !== '' && normalized !== '-' && normalized !== 'n/a'; + // Helper function to check if a cell actually contains data + // Multiple indicators are used to determine if data is present + function hasValidData(cellElement) { + if (!cellElement) return false; + + // Indicator 1: Check if the cell contains a

<p> tag (Yahoo uses this for "no data") + const pTag = cellElement.querySelector('p'); + if (pTag) return false; + + // Indicator 2: Check the direct child structure + // Valid data cells have: td > span > div or td > span > div > a + // Invalid data cells have: td > span > p + const span = cellElement.querySelector('span'); + if (span) { + const directChildren = Array.from(span.children); + // If the only or first child is a

<p>, it's likely "no data" + if (directChildren.length > 0 && directChildren[0].tagName === 'P') { + return false; + } + } + + // Indicator 3: Check text content + const text = extractText(cellElement); + if (!text) return false; + const normalized = text.toLowerCase().trim(); + + // Common "no data" indicators + const noDataIndicators = [ + '-', + 'n/a', + 'na', + 'none', + 'not available', + 'no data', + '--', + '—', // em dash + '–', // en dash + ]; + + if (noDataIndicators.includes(normalized)) { + return false; + } + + // Indicator 4: Check for common CSS classes that indicate empty state + const classIndicators = ['empty', 'no-data', 'na', 'null', 'undefined']; + const classList = cellElement.className || ''; + for (const indicator of classIndicators) { + if (classList.includes(indicator)) { + return false; + } + } + + // Indicator 5: Check if cell has an anchor tag (usually indicates real data) + const hasLink = cellElement.querySelector('a') !== null; + + // Indicator 6: Check if there's actual substantial content + // If text is very short (1-2 chars) and not alphanumeric, it's likely not real data + if (text.length <= 2 && !/[a-zA-Z0-9]/.test(text)) { + return false; + } + + // If we passed all checks, consider it valid data + return true; + } + + // Helper function to extract and normalize data from a cell + function extractCellData(cellElement) { + if (!cellElement) return null; + if (!hasValidData(cellElement)) return null; + + const text = extractText(cellElement); + return text || null; } // Helper function to extract and normalize data from a row function extractRowData(row) { - // Extract ticker from column 1 (td:nth-child(1) > span > div > a) - const tickerElement = row.querySelector('td:nth-child(1) > span > div > a') || - row.querySelector('td:nth-child(1)'); - const tickerRaw = extractText(tickerElement); - const ticker = isValidValue(tickerRaw) ? 
tickerRaw : null; + // Extract ticker from column 1 (td:nth-child(1)) + const tickerCell = row.querySelector('td:nth-child(1)'); + const ticker = extractCellData(tickerCell); - // Extract sector from column 4 (td:nth-child(4) > span > div > a or td:nth-child(4) > span > div) - const sectorElement = row.querySelector('td:nth-child(4) > span > div > a') || - row.querySelector('td:nth-child(4) > span > div') || - row.querySelector('td:nth-child(4)'); - const sectorRaw = extractText(sectorElement); - const sector = isValidValue(sectorRaw) ? sectorRaw : null; + // Extract sector from column 4 (td:nth-child(4)) + const sectorCell = row.querySelector('td:nth-child(4)'); + const sector = extractCellData(sectorCell); - // Extract exchange from column 6 (td:nth-child(6) > span > div) - const exchangeElement = row.querySelector('td:nth-child(6) > span > div') || - row.querySelector('td:nth-child(6)'); - const exchangeRaw = extractText(exchangeElement); - const exchange = isValidValue(exchangeRaw) ? 
exchangeRaw : null; + // Extract exchange from column 6 (td:nth-child(6)) + const exchangeCell = row.querySelector('td:nth-child(6)'); + const exchange = extractCellData(exchangeCell); return { ticker, sector, exchange }; } @@ -85,21 +145,37 @@ var extractionResult = (function() { return count; } + // Helper function to score a row (prioritize rows with more complete data) + function scoreRow(data) { + let score = 0; + + // Ticker is mandatory and gets highest weight + if (data.ticker) score += 100; + + // Sector and exchange are nice-to-have + if (data.sector) score += 10; + if (data.exchange) score += 10; + + return score; + } + // Extract data from all rows and find the one with most complete data let bestRow = null; - let maxFieldCount = -1; + let maxScore = -1; let rowIndex = 0; for (const row of allRows) { const data = extractRowData(row); - const fieldCount = countValidFields(data); + const score = scoreRow(data); - // Select row with most valid data, or first row if tied - if (fieldCount > maxFieldCount) { + // Select row with highest score (most complete data) + // If tied, first row wins + if (score > maxScore) { bestRow = data; - maxFieldCount = fieldCount; + maxScore = score; bestRow.rowIndex = rowIndex; - bestRow.validFieldCount = fieldCount; + bestRow.validFieldCount = countValidFields(data); + bestRow.score = score; } rowIndex++; @@ -126,6 +202,7 @@ var extractionResult = (function() { metadata: { selectedRowIndex: bestRow.rowIndex, validFieldCount: bestRow.validFieldCount, + score: bestRow.score, totalRows: allRows.length } };