let SiteScrape = module.exports = {}

//
// Helpers for scraping site metadata.
// $ = the cheerio scrape object
//

//Collapse any run of whitespace into a single space
const removeWhitespace = /\s+/g

//Words too common to count as keywords
const commonWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
'she','or','an','will','my','one','all','would','there','their','if','when','than','because','while','where',
'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

//Page <title> text with whitespace collapsed
SiteScrape.getTitle = ($) => {

    let title = $('title').text().replace(removeWhitespace, " ")
    return title

}

//Finds all urls in text, removes duplicates, makes sure they have https://
SiteScrape.getCleanUrls = (textBlock) => {

    //Find all URLs in text
    //@TODO - Use the process text library for this function
    const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
    let allUrls = textBlock.match(urlPattern)

    if(allUrls == null){
        return []
    }

    //Every URL needs HTTPS!!!
    let foundUrls = []
    allUrls.forEach( (item, index) => {
        //add protocol if it is missing
        if(item.indexOf('https://') == -1 && item.indexOf('http://') == -1){
            allUrls[index] = 'https://'+item
        }
        //convert http to https
        if(item.indexOf('http://') >= 0){
            allUrls[index] = item.replace('http://','https://')
        }
    })

    //Remove all duplicates
    foundUrls = [...new Set(allUrls)]

    return foundUrls
}

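//Illustrative example (made-up input):
//  getCleanUrls('Read http://foo.com and www.foo.com and http://foo.com')
//  -> [ 'https://foo.com', 'https://www.foo.com' ]
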
//Site hostname with https:// eg: https://www.google.com
SiteScrape.getHostName = (url) => {

    var hostname = 'https://'+(new URL(url)).hostname;
    console.log('hostname', hostname)
    return hostname
}

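//Illustrative example (made-up input):
//  getHostName('http://example.com/some/page') -> 'https://example.com'
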
// URL for image that can be downloaded to represent website
SiteScrape.getDisplayImage = ($, url) => {

    const hostname = SiteScrape.getHostName(url)

    let metaImg = $('meta[property="og:image"]')
    let shortcutIcon = $('link[rel="shortcut icon"]')
    let favicon = $('link[rel="icon"]')
    let randomImg = $('img')

    //Declare the result up front so the fallbacks below can overwrite it
    let thumbnail = ''

    console.log('----')

    //Scrape metadata for page image, from least to most preferred;
    //each match below overwrites the previous one

    //Grab the first random image we find
    if(randomImg && randomImg[0] && randomImg[0].attribs){
        thumbnail = hostname + randomImg[0].attribs.src
        console.log('random img '+thumbnail)
    }
    //Grab the favicon of the site
    if(favicon && favicon[0] && favicon[0].attribs){
        thumbnail = hostname + favicon[0].attribs.href
        console.log('favicon '+thumbnail)
    }
    //Grab the shortcut icon
    if(shortcutIcon && shortcutIcon[0] && shortcutIcon[0].attribs){
        thumbnail = hostname + shortcutIcon[0].attribs.href
        console.log('shortcut '+thumbnail)
    }
    //Grab the presentation image for the site
    if(metaImg && metaImg[0] && metaImg[0].attribs){
        thumbnail = metaImg[0].attribs.content
        console.log('ogImg '+thumbnail)
    }

    console.log('-----')
    return thumbnail
}

// Get all the site text and parse out the words that appear most
SiteScrape.getKeywords = ($) => {

    let majorContent = ''

    majorContent += $('[class*=content]').text()
        .replace(removeWhitespace, " ") //Collapse whitespace to single spaces
        .replace(/\W\s/g, '') //Strip non-word characters that precede whitespace
        .substring(0,3000) //Limit to 3000 characters
        .toLowerCase()

    //Count frequency of each word in scraped text
    let frequency = {}
    majorContent.split(' ').forEach(word => {
        if(commonWords.includes(word)){
            return //Exclude common words
        }
        if(!frequency[word]){
            frequency[word] = 0
        }
        frequency[word]++
    })

    //Create a sortable array of [word, count] pairs, keeping only repeated words
    var sortable = [];
    for (var index in frequency) {
        if(frequency[index] > 1){
            sortable.push([index, frequency[index]]);
        }
    }

    //Sort them by most used words in the list
    sortable.sort(function(a, b) {
        return b[1] - a[1];
    });

    //Take the top five words
    let finalWords = []
    for(let i=0; i<5; i++){
        if(sortable[i] && sortable[i][0]){
            finalWords.push(sortable[i][0])
        }
    }

    if(finalWords.length > 0){
        return 'Keywords: ' + finalWords.join(', ')
    }
    return ''
}

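//Illustrative example: for a page whose class*=content blocks repeat the words
//"coffee" and "roast" most often, getKeywords($) would return
//'Keywords: coffee, roast'; a page with no repeated uncommon words returns ''
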
// Get the main text content of the page
//@TODO - Not implemented yet
SiteScrape.getMainText = ($) => {}
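
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only; assumes cheerio is installed and this file
// is saved as SiteScrape.js). The block runs only when the file is executed
// directly, e.g. `node SiteScrape.js`, so require()-ing the module is
// unaffected. The HTML below is made up.
if (require.main === module) {
    const cheerio = require('cheerio')

    const html = '<html><head><title>  Example   Page  </title></head>' +
        '<body><div class="content">scrape words scrape words scrape</div>' +
        '<p>Read http://example.com/docs and www.example.org/guide</p></body></html>'
    const $ = cheerio.load(html)

    console.log(SiteScrape.getTitle($))                          // " Example Page "
    console.log(SiteScrape.getKeywords($))                       // "Keywords: scrape, words"
    console.log(SiteScrape.getCleanUrls($('p').text()))          // [ 'https://example.com/docs', 'https://www.example.org/guide' ]
    console.log(SiteScrape.getHostName('http://example.com/a'))  // "https://example.com" (also logged inside getHostName)
}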