// Utilities for scraping metadata out of a fetched web page.
// `$` parameters are cheerio scrape objects.
const SiteScrape = {}

// CommonJS export; the guard keeps the file loadable where `module` is absent.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = SiteScrape
}

// Collapses any run of whitespace into a single space when used with replace().
const removeWhitespace = /\s+/g

// Stop words and social-share boilerplate that should never count as keywords.
// A Set gives O(1) lookups in the keyword-counting loop (was Array.includes).
const commonWords = new Set(['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
  'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
  'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
  'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
  'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
  'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its'])

// Page <title> text with whitespace runs collapsed to single spaces.
SiteScrape.getTitle = ($) => {
  return $('title').text().replace(removeWhitespace, ' ')
}

// Finds all URLs in text, removes duplicates, makes sure they have https://.
// Returns [] when the text contains no URLs.
// @TODO - Use the process text library for this function
SiteScrape.getCleanUrls = (textBlock) => {
  // Matches URLs that start with a protocol, "www." or "ftp.".
  const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
  const allUrls = textBlock.match(urlPattern)
  if (allUrls == null) {
    return []
  }
  // Every URL needs HTTPS!!!
  const secured = allUrls.map((url) => {
    if (url.startsWith('https://')) {
      return url
    }
    // Upgrade plain http. Decided by startsWith: previously a URL merely
    // *containing* "http://" (e.g. in a redirect param) was rewritten in the
    // middle instead of being given a protocol prefix.
    if (url.startsWith('http://')) {
      return 'https://' + url.slice('http://'.length)
    }
    return 'https://' + url // protocol missing entirely (www./ftp. match)
  })
  // Remove all duplicates, preserving first-seen order.
  return [...new Set(secured)]
}

// Site hostname with https:// eg: https://www.google.com
SiteScrape.getHostName = (url) => {
  return 'https://' + new URL(url).hostname
}

// URL for an image that can be downloaded to represent the website.
// Candidate priority: og:image meta tag first, then in-page <img> tags
// (rotated so the middle of the page leads), then favicon / shortcut icon.
// Returns '' when no acceptable image is found.
SiteScrape.getDisplayImage = ($, url) => {
  const hostname = SiteScrape.getHostName(url)
  const metaImg = $('[property="og:image"]')
  const shortcutIcon = $('[rel="shortcut icon"]')
  const favicon = $('[rel="icon"]')
  const randomImg = $('img')

  // Set of images we may want, gathered from various places in the source.
  let imagesWeWant = []

  // In-page images, rotated so the second half of the page comes first.
  if (randomImg && randomImg.length > 0) {
    const imgSrcs = []
    for (let i = 0; i < randomImg.length; i++) {
      imgSrcs.push(randomImg[i].attribs.src) // may be undefined; filtered below
    }
    const half = Math.ceil(imgSrcs.length / 2)
    imagesWeWant = [...imgSrcs.slice(-half), ...imgSrcs.slice(0, half)]
  }

  // Icons go after page images; og:image goes in front of everything.
  if (favicon && favicon[0] && favicon[0].attribs) {
    imagesWeWant.push(favicon[0].attribs.href)
  }
  if (shortcutIcon && shortcutIcon[0] && shortcutIcon[0].attribs) {
    imagesWeWant.push(shortcutIcon[0].attribs.href)
  }
  if (metaImg && metaImg[0] && metaImg[0].attribs) {
    imagesWeWant.unshift(metaImg[0].attribs.content)
  }

  // Keep only accepted file formats (also drops undefined srcs).
  const acceptedFormats = ['.jpg', '.jpeg', '.png', '.gif']
  imagesWeWant = imagesWeWant.filter(
    (img) => acceptedFormats.some((ext) => String(img).includes(ext))
  )

  // If any candidate is absolute (has a protocol or is protocol-relative),
  // relative paths are ignored below — absolute URLs are the deliberate ones.
  const foundAbsolute = imagesWeWant.some(
    (img) => String(img).includes('//') || String(img).includes('http')
  )

  // Grab the eligible candidate closest to the top of the list.
  for (let img of imagesWeWant) {
    img = String(img)
    if (!img.includes('//') && foundAbsolute) {
      continue // skip relative paths when absolute candidates exist
    }
    if (!img.includes('//')) {
      // Relative path: anchor it to the site host.
      if (!img.startsWith('/')) {
        img = '/' + img
      }
      img = hostname + img
    }
    if (img.startsWith('//')) {
      img = 'https:' + img // scrape breaks without an explicit protocol
    }
    return img
  }
  return ''
}

// Get all the site text and parse out the words that appear most.
// Returns 'Keywords: a, b, ...' (up to five words) or '' if nothing repeats.
SiteScrape.getKeywords = ($) => {
  const majorContent = $('[class*=content]').text()
    .replace(removeWhitespace, ' ') // collapse all whitespace runs
    .replace(/[^\w\s]/g, '')        // strip punctuation but KEEP separators
                                    // (old /\W\s/g also ate the following
                                    // space, fusing adjacent words together)
    .substring(0, 3000)             // limit the work to 3000 characters
    .toLowerCase()

  // Count frequency of each word. A Map avoids Object.prototype collisions
  // (e.g. the word "constructor" was never counted with a plain object).
  const frequency = new Map()
  for (const word of majorContent.split(' ')) {
    if (commonWords.has(word)) {
      continue // exclude stop words and boilerplate
    }
    frequency.set(word, (frequency.get(word) || 0) + 1)
  }

  // Keep non-empty words that appear more than once, most frequent first.
  const sortable = [...frequency.entries()]
    .filter(([word, count]) => count > 1 && word)
  sortable.sort((a, b) => b[1] - a[1])

  const finalWords = sortable.slice(0, 5).map(([word]) => word)
  if (finalWords.length > 0) {
    return 'Keywords: ' + finalWords.join(', ')
  }
  return ''
}

// @TODO - Extract the main body text of the page. Not yet implemented.
SiteScrape.getMainText = ($) => {}