let SiteScrape = module.exports = {}

//
// Helpers for pulling display metadata out of a scraped page.
// $ = the cheerio scrape object
//

// Matches any run of whitespace so it can be collapsed to a single space
const removeWhitespace = /\s+/g

// Punctuation/symbols hanging off either end of a whitespace-delimited token.
// Unicode-aware so accented letters are kept as part of the word.
const edgePunctuation = /^[^\p{L}\p{N}_]+|[^\p{L}\p{N}_]+$/gu

// Finds URLs in free text: anything starting with a scheme, "www." or "ftp."
const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm

// Words that are too generic (or are share-widget noise) to count as keywords
const commonWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
    'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
    'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
    'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
    'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
    'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

// Lowercased lookup set: scraped text is lowercased before it is compared,
// so entries such as 'I' would otherwise never match
const commonWordSet = new Set(commonWords.map(word => word.toLowerCase()))

// Normalizes one matched URL so that web URLs always start with https://
const toHttpsUrl = (rawUrl) => {
    // Anchored + case-insensitive: only the URL's own scheme is rewritten,
    // never an "http://" that happens to appear inside a query string
    if (/^https?:\/\//i.test(rawUrl)) {
        return rawUrl.replace(/^https?:\/\//i, 'https://')
    }
    // ftp:// and file:// URLs are left alone; prefixing them would corrupt them
    if (/^[a-z][a-z0-9+.-]*:\/\//i.test(rawUrl)) {
        return rawUrl
    }
    return 'https://' + rawUrl
}

// Reads one attribute from the first element of a cheerio selection ('' if absent)
const firstAttribute = (selection, attributeName) => {
    const first = selection && selection[0]
    const value = first && first.attribs ? first.attribs[attributeName] : ''
    return typeof value === 'string' ? value : ''
}

// Resolves an asset path (absolute, root-relative or relative) against a base URL ('' on failure)
const resolveAssetUrl = (assetPath, baseUrl) => {
    if (!assetPath) {
        return ''
    }
    try {
        return new URL(assetPath, baseUrl).href
    } catch (err) {
        // An unparsable asset path is treated as "no image here"
        return ''
    }
}

/**
 * Page title with whitespace runs collapsed and the ends trimmed.
 * @param {Function} $ - the cheerio scrape object
 * @returns {string} text of the <title> selection ('' when there is none)
 */
SiteScrape.getTitle = ($) => {
    return $('title').text().replace(removeWhitespace, ' ').trim()
}

/**
 * Finds all urls in text, makes sure web urls start with https:// and removes duplicates.
 * @TODO - Use the process text library for this function
 * @param {string} textBlock - free text to search
 * @returns {string[]} unique urls in order of first appearance ([] when none / not a string)
 */
SiteScrape.getCleanUrls = (textBlock) => {
    if (typeof textBlock !== 'string') {
        return []
    }

    const allUrls = textBlock.match(urlPattern)
    if (allUrls == null) {
        return []
    }

    // Normalize first so "http://x" and "https://x" collapse into one entry
    return Array.from(new Set(allUrls.map(toHttpsUrl)))
}

/**
 * Site hostname with https:// eg: https://www.google.com
 * The original scheme, port and path are discarded.
 * @param {string} url - absolute page url
 * @returns {string} 'https://' + hostname
 * @throws {TypeError} when url cannot be parsed by the URL constructor
 */
SiteScrape.getHostName = (url) => {
    return 'https://' + (new URL(url)).hostname
}

/**
 * URL for an image that can be downloaded to represent the website.
 * Preference order: og:image meta tag, shortcut icon, favicon, first <img> on the page.
 * @param {Function} $ - the cheerio scrape object
 * @param {string} url - absolute url of the scraped page
 * @returns {string} image url, or '' when the page offers nothing usable
 * @throws {TypeError} when url cannot be parsed by the URL constructor
 */
SiteScrape.getDisplayImage = ($, url) => {
    // https host plus the page path, so relative asset paths resolve against the page
    const base = SiteScrape.getHostName(url) + (new URL(url)).pathname

    const byPriority = [
        // Presentation image is used exactly as the page declared it
        firstAttribute($('meta[property="og:image"]'), 'content'),
        resolveAssetUrl(firstAttribute($('link[rel="shortcut icon"]'), 'href'), base),
        resolveAssetUrl(firstAttribute($('link[rel="icon"]'), 'href'), base),
        resolveAssetUrl(firstAttribute($('img'), 'src'), base),
    ]

    // A tag that exists but lacks its attribute must not mask a lower-priority image
    return byPriority.find(candidate => candidate !== '') || ''
}

/**
 * Get the site text and parse out the words that appear most.
 * Only the first 3000 characters of text inside elements whose class contains
 * "content" are considered; common words and words seen once are ignored.
 * @param {Function} $ - the cheerio scrape object
 * @returns {string} 'Keywords: a, b, ...' with up to 5 words, or '' when none qualify
 */
SiteScrape.getKeywords = ($) => {
    const majorContent = $('[class*=content]').text()
        .replace(removeWhitespace, ' ') //Collapse whitespace runs
        .substring(0, 3000) //Limit to 3000 characters
        .toLowerCase()

    //Count frequency of each word in scraped text.
    //A Map is used so words like "constructor" cannot collide with object prototype keys
    const frequency = new Map()
    majorContent.split(' ').forEach(token => {
        //Strip punctuation from the token edges only, so neighbouring words are never merged
        const word = token.replace(edgePunctuation, '')
        if (word === '' || commonWordSet.has(word)) {
            return //Exclude empty tokens and common words
        }
        frequency.set(word, (frequency.get(word) || 0) + 1)
    })

    //Keep words used more than once, most used first, top 5
    const finalWords = Array.from(frequency)
        .filter(entry => entry[1] > 1)
        .sort((a, b) => b[1] - a[1])
        .slice(0, 5)
        .map(entry => entry[0])

    if (finalWords.length > 0) {
        return 'Keywords: ' + finalWords.join(', ')
    }
    return ''
}

// Placeholder: not implemented yet, currently returns undefined
SiteScrape.getMainText = ($) => {}