SolidScribe/server/helpers/SiteScrape.js
Max 276a72b4ce Gigantic Update
* Migrated manual tests to jest and started working on better coverage
* Added a bookmarklet and push key generation tool allowing URL pushing from bookmarklets
* Updated web scraping with tons of bug fixes
* Updated attachments page to handle new push links
* Aggressive note change checking: if patches get out of sync, the server overwrites bad updates.
2023-10-17 19:46:14 +00:00

220 lines
5.8 KiB
JavaScript

let SiteScrape = module.exports = {}
//
// $ = the cheerio scrape object
//
// Collapses any run of whitespace (spaces, tabs, newlines) into one match
const removeWhitespace = /\s+/g
// Stop words ignored by getKeywords. All entries are lowercase to match the
// lowercased scrape text, and duplicates have been removed ('and', 'that',
// 'but', 'or', 'as', 'after' were previously listed twice).
const commonWords = ['just','start','what','these','how','was','being','can','way','share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
'with','at','by','from','up','about','into','over','after','the','and','a','that','i','it','not','he','as','you','this','but','his','they','her',
'she','or','an','will','my','one','all','would','there','their','if','when','than','because','while','where',
'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']
// Page <title> text with runs of whitespace collapsed to single spaces
SiteScrape.getTitle = ($) => $('title').text().replace(removeWhitespace, " ")
//Finds all urls in text, removes duplicates, makes sure they have https://
//
// @param {string} textBlock - free text that may contain URLs
// @returns {string[]} unique URLs, each normalized to start with https://
SiteScrape.getCleanUrls = (textBlock) => {
  //Find all URLs in text
  //@TODO - Use the process text library for this function
  const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
  const allUrls = textBlock.match(urlPattern)
  if (allUrls == null) {
    return []
  }
  //Every URL needs HTTPS!!!
  //Check the START of each URL (previously indexOf matched anywhere, so a
  //schemeless URL embedding "http://" in a query param was left without a
  //protocol and only its embedded scheme was rewritten).
  const normalized = allUrls.map((item) => {
    //convert http to https
    if (item.startsWith('http://')) {
      return 'https://' + item.slice('http://'.length)
    }
    //add protocol if it is missing
    if (!item.startsWith('https://')) {
      return 'https://' + item
    }
    return item
  })
  //Remove all duplicates
  return [...new Set(normalized)]
}
//Site hostname with https:// eg: https://www.google.com
SiteScrape.getHostName = (url) => {
  const { hostname } = new URL(url)
  return `https://${hostname}`
}
// URL for image that can be downloaded to represent website
//
// Gathers candidate image URLs from several places in the page source
// (inline <img> tags, favicon / shortcut-icon links, og:image meta tag),
// filters them to common raster formats, and returns one candidate.
// Returns '' when nothing acceptable is found.
SiteScrape.getDisplayImage = ($, url) => {
const hostname = SiteScrape.getHostName(url)
let metaImg = $('[property="og:image"]')
let shortcutIcon = $('[rel="shortcut icon"]')
let favicon = $('[rel="icon"]')
let randomImg = $('img')
//Set of images we may want gathered from various places in source
let imagesWeWant = []
let thumbnail = ''
//Scrape metadata for page image
//Interleaves the back half of the img srcs ahead of the front half —
//presumably to prefer mid/late-page content images; TODO confirm intent
if(randomImg && randomImg.length > 0){
let imgSrcs = []
for (let i = 0; i < randomImg.length; i++) {
imgSrcs.push( randomImg[i].attribs.src )
}
const half = Math.ceil(imgSrcs.length / 2)
imagesWeWant = [...imgSrcs.slice(-half), ...imgSrcs.slice(0,half) ]
}
//Grab the favicon
if(favicon && favicon[0] && favicon[0].attribs){
imagesWeWant.push(favicon[0].attribs.href)
}
//Grab the shortcut icon
if(shortcutIcon && shortcutIcon[0] && shortcutIcon[0].attribs){
imagesWeWant.push(shortcutIcon[0].attribs.href)
}
//Grab the presentation image for the site (og:image). unshift puts it
//first so it wins the "closest to the top" pick in the final loop below
if(metaImg && metaImg[0] && metaImg[0].attribs){
imagesWeWant.unshift(metaImg[0].attribs.content)
}
//Remove everything that isn't an accepted file format
//(String() also guards against undefined src/href attribute values)
for (let i = imagesWeWant.length - 1; i >= 0; i--) {
let img = String(imagesWeWant[i])
if(
!img.includes('.jpg') &&
!img.includes('.jpeg') &&
!img.includes('.png') &&
!img.includes('.gif')
){
imagesWeWant.splice(i,1)
}
}
//Find if we have absolute thumbnails or not
let foundAbsolute = false
for (let i = imagesWeWant.length - 1; i >= 0; i--) {
let img = imagesWeWant[i]
//Add host name if its not included
if(String(img).includes('//') || String(img).includes('http')){
foundAbsolute = true
break
}
}
//Go through all found images. Grab the one closest to the top. Closer is better
//(the loop runs back-to-front and overwrites `thumbnail` on every eligible
//pass, so the earliest candidate in imagesWeWant is the final value)
for (let i = imagesWeWant.length - 1; i >= 0; i--) {
let img = imagesWeWant[i]
//If any absolute URL survived filtering, skip relative paths entirely
if(!String(img).includes('//') && foundAbsolute){
continue;
}
//Only add host to images if no absolute images were found
if(!String(img).includes('//') ){
if(img.indexOf('/') != 0){
img = '/' + img
}
img = hostname + img
}
//Protocol-relative URL (//cdn.example.com/...): add the scheme
if(img.indexOf('//') == 0){
img = 'https:' + img //Scrape breaks without protocol
}
thumbnail = img
}
return thumbnail
}
// Get all the site text and parse out the words that appear most
//
// @param {Function} $ - cheerio scrape object for the page
// @returns {string} 'Keywords: a, b, ...' (up to 6 words) or '' if none
SiteScrape.getKeywords = ($) => {
  //Pull text from any element whose class contains "content"
  let majorContent = ''
  majorContent += $('[class*=content]').text()
    .replace(removeWhitespace, " ") //Remove all whitespace
    .substring(0, 6000) //Limit to 6000 characters
    .toLowerCase()
    .replace(/[^A-Za-z0-9- ]/g, '')
  //Count frequency of each word in scraped text.
  //Object.create(null) so scraped words like "constructor" don't collide
  //with inherited Object.prototype keys and corrupt their counts.
  const frequency = Object.create(null)
  majorContent.split(' ').forEach(word => {
    // Exclude short or common words
    if (commonWords.includes(word) || word.length <= 2) {
      return
    }
    if (!frequency[word]) {
      frequency[word] = 0
    }
    // Skip some plurals: if the plural form was already counted, the
    // singular stays at 0 and is dropped by the >1 filter below
    if (frequency[word + 's'] || frequency[word + 'es']) {
      return
    }
    frequency[word]++
  })
  //Create a sortable array, keeping only words seen more than once
  const sortable = []
  for (const word in frequency) {
    if (frequency[word] > 1) {
      sortable.push([word, frequency[word]])
    }
  }
  //Sort them by most used words in the list
  sortable.sort((a, b) => b[1] - a[1])
  //Take at most the top 6 words
  const finalWords = []
  for (let i = 0; i < 6; i++) {
    if (sortable[i] && sortable[i][0]) {
      finalWords.push(sortable[i][0])
    }
  }
  if (finalWords.length > 0) {
    return 'Keywords: ' + finalWords.join(', ')
  }
  return ''
}
SiteScrape.getMainText = ($) => {}