diff options
Diffstat (limited to 'chromium/third_party/readability/js/readability.js')
-rw-r--r-- | chromium/third_party/readability/js/readability.js | 231 |
1 files changed, 114 insertions, 117 deletions
diff --git a/chromium/third_party/readability/js/readability.js b/chromium/third_party/readability/js/readability.js index 68a0286497a..4308093edbb 100644 --- a/chromium/third_party/readability/js/readability.js +++ b/chromium/third_party/readability/js/readability.js @@ -1,14 +1,20 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Local modifications to this file are described in the README.chromium +// file. var dbg = (typeof console !== 'undefined') ? function(s) { console.log("Readability: " + s); } : function() {}; /* - * Readability. An Arc90 Lab Experiment. + * Readability. An Arc90 Lab Experiment. * Website: http://lab.arc90.com/experiments/readability * Source: http://code.google.com/p/arc90labs-readability * - * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. + * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. * * Copyright (c) 2010 Arc90 Inc * Readability is licensed under the Apache License, Version 2.0. @@ -20,6 +26,7 @@ var readability = { distilledHTML: '', distilledArticleContent: null, + nextPageLink: '', version: '1.7.1', iframeLoads: 0, @@ -41,7 +48,7 @@ var readability = { maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ - + /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. @@ -66,7 +73,7 @@ var readability = { /** * Runs readability. - * + * * Workflow: * 1. Prep the document by removing script tags, css, etc. * 2. Build readability's DOM tree. @@ -86,8 +93,11 @@ var readability = { readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; /* Pull out any possible next page link first */ - var nextPageLink = readability.findNextPageLink(document.body); - + readability.nextPageLink = readability.findNextPageLink(document.body); + + /* We handle processing of nextPage from C++ set nextPageLink to null */ + var nextPageLink = null; + readability.prepDocument(); /* Build readability's DOM tree */ @@ -152,12 +162,8 @@ var readability = { window.scrollTo(0, 0); - // TODO(bengr): Remove this assignment of null to nextPageLink when - // the processing of the next page link is safe. - nextPageLink = null; - if (nextPageLink) { - /** + /** * Append any additional pages after a small timeout so that people * can start reading without having to wait for this to finish processing. **/ @@ -179,16 +185,16 @@ var readability = { var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); if(readability.reversePageScroll) { - readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); + readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); } else { - readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); + readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); } - + return false; } }; - + document.onkeyup = function(e) { var code = (window.event) ? event.keyCode : e.keyCode; if (code === 16) { @@ -200,7 +206,7 @@ var readability = { /** * Run any post-process modifications to article content as necessary. - * + * * @param Element * @return void **/ @@ -226,7 +232,7 @@ var readability = { for(var i=0, il = images.length; i < il; i+=1) { var image = images[i]; - + if(image.offsetWidth > imageWidthThreshold) { image.className += " blockImage"; } @@ -242,7 +248,7 @@ var readability = { var articleTools = document.createElement("DIV"); articleTools.id = "readTools"; - articleTools.innerHTML = + articleTools.innerHTML = "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; @@ -259,13 +265,13 @@ var readability = { function sanitizeText() { return text.replace(/@\w+/, ""); } - + function countMatches(match) { var matches = text.match(new RegExp(match, "g")); - return matches !== null ? matches.length : 0; + return matches !== null ? matches.length : 0; } - - function isRTL() { + + function isRTL() { var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); @@ -289,15 +295,15 @@ var readability = { try { curTitle = origTitle = document.title; if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ - curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); + curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); } } catch(e) {} - + if(curTitle.match(/ [\|\-] /)) { curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); - + if(curTitle.split(' ').length < 3) { curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); } @@ -330,7 +336,7 @@ var readability = { /** * Prepare the HTML document for readability to scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. - * + * * @return void **/ prepDocument: function () { @@ -342,7 +348,7 @@ var readability = { { var body = document.createElement("body"); try { - document.body = body; + document.body = body; } catch(e) { document.documentElement.appendChild(body); @@ -374,11 +380,11 @@ var readability = { biggestFrameSize = frameSize; readability.biggestFrame = frames[frameIndex]; } - + if(canAccessFrame && frameSize > bestFrameSize) { readability.frameHack = true; - + bestFrame = frames[frameIndex]; bestFrameSize = frameSize; } @@ -390,7 +396,7 @@ var readability = { readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); newBody.style.overflow = 'scroll'; document.body = newBody; - + var frameset = document.getElementsByTagName('frameset')[0]; if(frameset) { frameset.parentNode.removeChild(frameset); } @@ -455,7 +461,7 @@ var readability = { var imgCount = articleParagraphs[i].getElementsByTagName('img').length; var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; var objectCount = articleParagraphs[i].getElementsByTagName('object').length; - + if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); } @@ -468,7 +474,7 @@ var readability = { dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); } }, - + /** * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. @@ -477,7 +483,7 @@ var readability = { * @return void **/ initializeNode: function (node) { - node.readability = {"contentScore": 0}; + node.readability = {"contentScore": 0}; switch(node.tagName) { case 'DIV': @@ -489,7 +495,7 @@ var readability = { case 'BLOCKQUOTE': node.readability.contentScore += 3; break; - + case 'ADDRESS': case 'OL': case 'UL': @@ -511,10 +517,10 @@ var readability = { node.readability.contentScore -= 5; break; } - + node.readability.contentScore += readability.getClassWeight(node); }, - + /*** * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. @@ -525,7 +531,7 @@ var readability = { grabArticle: function (pageToClone) { var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), isPaging = (page !== null) ? true: false; - + var page = null; // Never work on the actual page. if (isPaging) { @@ -533,7 +539,7 @@ var readability = { } else { page = pageToClone.cloneNode(true); } - + var allElements = page.getElementsByTagName('*'); /** @@ -561,7 +567,7 @@ var readability = { node.parentNode.removeChild(node); nodeIndex-=1; continue; - } + } } if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") { @@ -598,7 +604,7 @@ var readability = { } } } - } + } } /** @@ -640,15 +646,15 @@ var readability = { /* Add points for any commas within this paragraph */ contentScore += innerText.split(',').length; - + /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ contentScore += Math.min(Math.floor(innerText.length / 100), 3); - + /* Add the score to the parent. The grandparent gets half. */ parentNode.readability.contentScore += contentScore; if(grandParentNode) { - grandParentNode.readability.contentScore += contentScore/2; + grandParentNode.readability.contentScore += contentScore/2; } } @@ -725,12 +731,12 @@ var readability = { { append = true; } - + if(siblingNode.nodeName === "P") { var linkDensity = readability.getLinkDensity(siblingNode); var nodeContent = readability.getInnerText(siblingNode); var nodeLength = nodeContent.length; - + if(nodeLength > 80 && linkDensity < 0.25) { append = true; @@ -747,7 +753,7 @@ var readability = { var nodeToAppend = null; if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - + dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); nodeToAppend = document.createElement("DIV"); try { @@ -765,7 +771,7 @@ var readability = { s-=1; sl-=1; } - + /* To ensure a node does not interfere with readability styles, remove its classnames */ nodeToAppend.className = ""; @@ -779,15 +785,15 @@ var readability = { **/ readability.distilledArticleContent = articleContent.cloneNode(true); //readability.prepArticle(articleContent); - + if (readability.curPageNum === 1) { var newNode = document.createElement('div'); newNode.id = "readability-page-1"; newNode.setAttribute("class", "page"); readability.moveNodeInnards(articleContent, newNode); articleContent.appendChild(newNode); - } - + } + /** * Now that we've gone through the full algorithm, check to see if we got any meaningful content. * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher @@ -813,7 +819,7 @@ var readability = { return articleContent; }, - + /** * Removes script tags from the document. * @@ -828,12 +834,12 @@ var readability = { scripts[i].nodeValue=""; scripts[i].removeAttribute('src'); if (scripts[i].parentNode) { - scripts[i].parentNode.removeChild(scripts[i]); + scripts[i].parentNode.removeChild(scripts[i]); } } } }, - + /** * Get the inner text of a node - cross browser compatibly. * This also strips out any excess whitespace to be found. @@ -896,18 +902,18 @@ var readability = { if ( cur.nodeType === 1 ) { // Remove style attribute(s) : if(cur.className !== "readability-styled") { - cur.removeAttribute("style"); + cur.removeAttribute("style"); } readability.cleanStyles( cur ); } cur = cur.nextSibling; - } + } }, - + /** * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. - * + * * @param Element * @return number (float) **/ @@ -918,11 +924,11 @@ var readability = { for(var i=0, il=links.length; i<il;i+=1) { linkLength += readability.getInnerText(links[i]).length; - } + } return linkLength / textLength; }, - + /** * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. * @@ -944,10 +950,10 @@ var readability = { /* If the type isn't alpha-only, it's probably not actually a file extension. */ if(!possibleType.match(/[^a-zA-Z]/)) { - segment = segment.split(".")[0]; + segment = segment.split(".")[0]; } } - + /** * EW-CMS specific segment replacement. Ugly. * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html @@ -968,7 +974,7 @@ var readability = { if (i < 2 && segment.match(/^\d{1,2}$/)) { del = true; } - + /* If this is the first segment and it's just "index", remove it. */ if(i === 0 && segment.toLowerCase() === "index") { del = true; @@ -992,7 +998,7 @@ var readability = { /** * Look for any paging links that may occur within the document. - * + * * @param body * @return object (array) **/ @@ -1008,7 +1014,7 @@ var readability = { * * Also possible: levenshtein distance? longest common subsequence? * - * After we do that, assign each page a score, and + * After we do that, assign each page a score, and **/ for(var i = 0, il = allLinks.length; i < il; i+=1) { var link = allLinks[i], @@ -1018,12 +1024,12 @@ var readability = { if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) { continue; } - + /* If it's on a different domain, skip it. */ if(window.location.host !== linkHref.split(/\/+/g)[1]) { continue; } - + var linkText = readability.getInnerText(link); /* If the linkText looks like it's not the next page, skip it. */ @@ -1036,9 +1042,9 @@ var readability = { if(!linkHrefLeftover.match(/\d/)) { continue; } - + if(!(linkHref in possiblePages)) { - possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; + possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; } else { possiblePages[linkHref].linkText += ' | ' + linkText; } @@ -1060,7 +1066,7 @@ var readability = { if(linkData.match(/pag(e|ing|inat)/i)) { linkObj.score += 25; } - if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, + if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ if(!linkObj.linkText.match(readability.regexps.nextLink)) { linkObj.score -= 65; @@ -1087,10 +1093,10 @@ var readability = { /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */ if(!parentNodeClassAndId.match(readability.regexps.positive)) { linkObj.score -= 25; - negativeNodeMatch = true; + negativeNodeMatch = true; } } - + parentNode = parentNode.parentNode; } @@ -1152,7 +1158,7 @@ var readability = { dbg('NEXT PAGE IS ' + nextHref); readability.parsedPages[nextHref] = true; - return nextHref; + return nextHref; } else { return null; @@ -1204,7 +1210,7 @@ var readability = { if (typeof options === 'undefined') { options = {}; } request.onreadystatechange = respondToReadyState; - + request.open('get', url, true); request.setRequestHeader('Accept', 'text/html'); @@ -1239,7 +1245,7 @@ var readability = { articlePage.appendChild(linkDiv); return; } - + /** * Now that we've built the article page DOM element, get the page content * asynchronously and load the cleaned content into the div we created for it. @@ -1257,7 +1263,7 @@ var readability = { return; } else { readability.pageETags[eTag] = 1; - } + } } // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. @@ -1308,7 +1314,7 @@ var readability = { } } } - + readability.removeScripts(content); readability.moveNodeInnards(content, thisPage); @@ -1330,9 +1336,9 @@ var readability = { }); }(nextPageLink, articlePage)); }, - + /** - * Get an elements class/id weight. Uses regular expressions to tell if this + * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. * * @param Element @@ -1382,7 +1388,7 @@ var readability = { var allElements = e.getElementsByTagName('*'); while (i < allElements.length) { readability.deleteExtraBreaks(allElements[i]); - i++; + i++; } }, @@ -1397,7 +1403,7 @@ var readability = { clean: function (e, tag) { var targetList = e.getElementsByTagName( tag ); var isEmbed = (tag === 'object' || tag === 'embed'); - + for (var y=targetList.length-1; y >= 0; y-=1) { /* Allow youtube and vimeo videos through as people usually want to see those. */ if(isEmbed) { @@ -1405,7 +1411,7 @@ var readability = { for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { attributeValues += targetList[y].attributes[i].value + '|'; } - + /* First, check the elements attributes to see if any of them contain youtube or vimeo */ if (attributeValues.search(readability.regexps.videos) !== -1) { continue; @@ -1415,13 +1421,13 @@ var readability = { if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { continue; } - + } targetList[y].parentNode.removeChild(targetList[y]); } }, - + /** * Clean an element of all tags of type "tag" if they look fishy. * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. @@ -1446,7 +1452,7 @@ var readability = { for (var i=curTagsLength-1; i >= 0; i-=1) { var weight = readability.getClassWeight(tagsList[i]); var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; - + dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); if(weight+contentScore < 0) @@ -1467,7 +1473,7 @@ var readability = { var embeds = tagsList[i].getElementsByTagName("embed"); for(var ei=0,il=embeds.length; ei < il; ei+=1) { if (embeds[ei].src.search(readability.regexps.videos) === -1) { - embedCount+=1; + embedCount+=1; } } @@ -1480,7 +1486,7 @@ var readability = { } else if(li > p && tag !== "ul" && tag !== "ol") { toRemove = true; } else if( input > Math.floor(p/3) ) { - toRemove = true; + toRemove = true; } else if(contentLength < 25 && (img === 0 || img > 2) ) { toRemove = true; } else if(weight < 25 && linkDensity > 0.2) { @@ -1522,7 +1528,7 @@ var readability = { addFlag: function(flag) { readability.flags = readability.flags | flag; }, - + removeFlag: function(flag) { readability.flags = readability.flags & ~flag; }, @@ -1591,7 +1597,7 @@ var readability = { } return ret; }, - + // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a // <P> node, and makes all next siblings of that pair children of <P>, up // until the next pair of <BR> nodes is reached. @@ -1600,7 +1606,7 @@ var readability = { var second = readability.isMultipleBr(node, true); if (!second) { return; - } + } // Make all next siblings of the second BR into children of a P. var p = document.createElement('p'); var curr = second.nextSibling; @@ -1613,7 +1619,7 @@ var readability = { curr = next; } var ret = curr; - + // Remove all nodes between the first and second BR. curr = node.nextSibling; while (curr && curr != second) { @@ -1625,10 +1631,10 @@ var readability = { second.parentNode.removeChild(second); // Replace the first BR with the P. node.parentNode.replaceChild(p, node); - + return ret; }, - + // Returns true if the NodeList contains a double <BR>. hasDoubleBr: function(nodeList) { for (var i = 0; i < nodeList.length; nodeList++) { @@ -1637,8 +1643,8 @@ var readability = { } } return false; - }, - + }, + // Replaces double <BR> tags with <P> tags. replaceDoubleBrsWithPs: function(node) { var allElements = node.getElementsByTagName('BR'); @@ -1652,8 +1658,8 @@ var readability = { allElements = document.body.getElementsByTagName('BR'); } }, - - + + // Replaces a BR and the whitespace that follows it with a P. replaceBrWithP: function(node) { if (!readability.isBrNode(node)) { @@ -1673,7 +1679,7 @@ var readability = { node.parentNode.replaceChild(p, node); return curr; }, - + // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag // children of the <P>. replaceBrsWithPs: function(node) { @@ -1687,27 +1693,27 @@ var readability = { allElements = document.body.getElementsByTagName('BR'); } }, - + // Replaces any tag with any other tag. replaceTagsWithTags: function(node, srcTag, destTag) { var allElements = node.getElementsByTagName(srcTag); for (var i = 0; i < allElements.length; i++) { var dest = document.createElement(destTag); readability.moveNodeInnards(allElements[i], dest); - node.replaceNode(dest, allElements[i]); + allElements[i].parentNode.replaceChild(dest, allElements[i]); } }, - + // Replaces all <noscript> tags with <p> tags. replaceNoscriptsWithPs: function(node) { readability.replaceTagsWithTags(node, 'noscript', 'p'); }, - + // Replaces all <font> tags with <span> tags. replaceFontsWithSpans: function(node) { readability.replaceTagsWithTags(node, 'font', 'span'); }, - + // Returns a list of image URLs in the distilled article. getImages : function() { var images = document.getElementsByTagName('img'); @@ -1719,23 +1725,14 @@ var readability = { } return result; }, - + // Returns the distilled article HTML from the page(s). getDistilledArticleHTML : function() { return readability.distilledHTML; + }, + + // Returns the next page of this article. + getNextPageLink : function() { + return readability.nextPageLink; } }; - -// Extracts long-form content from a page and returns and array where the first -// element is the article title, the second element is HTML containing the -// long-form content, and remaining elements are URLs for images referenced by -// that HTML. Each <img> tag in the HTML has an id field set to k - 2, which -// corresponds to a URL listed at index k in the array returned. -(function () { - readability.init(); - var result = new Array(2); - result[0] = readability.getArticleTitle(); - result[1] = readability.getDistilledArticleHTML(); - return result.concat(readability.getImages()); -}()) - |