* Added better base information to the site for scrapers

* Updated help text
* Refactored a lot of the scrape code into a SiteScrape helper
Commit 3535f0cb24 by Max G, 2020-04-13 06:17:37 +00:00 (parent a3fa4b0f3c)
7 changed files with 208 additions and 99 deletions
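The diffs below swap inline scraping logic for calls into the new SiteScrape helper. The helper module itself is not part of this excerpt, so the following is only a sketch of the interface those calls imply (getCleanUrls, getTitle, getHostName, getDisplayImage, getKeywords), with bodies assumed from the code this commit removes rather than taken from the real SiteScrape source. Here, $ is a cheerio-loaded page and URL is Node's built-in parser.

// Hypothetical shape of the SiteScrape helper, inferred from the calls in this commit
let SiteScrape = module.exports = {}

// Extract, normalize, and de-duplicate URLs found in raw note text (sketched further down)
SiteScrape.getCleanUrls = (text) => { /* ... */ }

// Page title with collapsed whitespace
SiteScrape.getTitle = ($) => $('title').text().replace(/\s+/g, ' ').trim()

// Host name of the scraped URL, assuming Node's global URL class
SiteScrape.getHostName = (url) => new URL(url).hostname

// Best-guess preview image and page keywords (both sketched further down)
SiteScrape.getDisplayImage = ($, url) => { /* ... */ }
SiteScrape.getKeywords = ($) => { /* ... */ }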


@@ -1,5 +1,7 @@
let db = require('@config/database')
let SiteScrape = require('@helpers/SiteScrape')
let Attachment = module.exports = {}
const cheerio = require('cheerio')
@@ -242,32 +244,8 @@ Attachment.scanTextForWebsites = (io, userId, noteId, noteText) => {
Attachment.urlForNote(userId, noteId).then(attachments => {
//Find all URLs in text
//@TODO - Use the process text library for this function
const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
let allUrls = noteText.match(urlPattern)
if(allUrls == null){
allUrls = []
}
//Every URL needs HTTPS!!!
let foundUrls = []
allUrls.forEach( (item, index) => {
//Every URL should have HTTPS
if(item.indexOf('https://') == -1 && item.indexOf('http://') == -1){
allUrls[index] = 'https://'+item
}
//URLs should all have HTTPS!!!
if(item.indexOf('http://') >= 0){
allUrls[index] = item.replace('http://','https://')
}
})
//Remove all duplicates
foundUrls = [...new Set(allUrls)]
//Pull all the URLs out of the text
let foundUrls = SiteScrape.getCleanUrls(noteText)
//Go through each saved URL, remove new URLs from saved URLs
//If a URL is not found, delete it
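//A possible SiteScrape.getCleanUrls, assembled from the logic this hunk removes
//(regex match, forcing https, de-duplicating). This is a sketch, not the actual
//helper source, which is outside this excerpt.
SiteScrape.getCleanUrls = (text) => {
    const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
    let allUrls = text.match(urlPattern) || []
    allUrls = allUrls.map(item => {
        //Every URL should use https
        if(item.indexOf('https://') == -1 && item.indexOf('http://') == -1){
            return 'https://' + item
        }
        return item.replace('http://', 'https://')
    })
    //Remove all duplicates
    return [...new Set(allUrls)]
}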
@@ -386,14 +364,6 @@ Attachment.processUrl = (userId, noteId, url) => {
return new Promise((resolve, reject) => {
const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']
var removeWhitespace = /\s+/g
const options = {
uri: url,
@@ -428,70 +398,33 @@ Attachment.processUrl = (userId, noteId, url) => {
})
.then($ => {
//Clear timeout that would end this function
clearTimeout(requestTimeout)
var desiredSearchText = ''
let pageTitle = $('title').text().replace(removeWhitespace, " ")
desiredSearchText += pageTitle + "\n"
// let header = $('h1').text().replace(removeWhitespace, " ")
// desiredSearchText += header + "\n"
//Scrape metadata for page image
let metadata = $('meta[property="og:image"]')
if(metadata && metadata[0] && metadata[0].attribs){
thumbnail = metadata[0].attribs.content
}
const pageTitle = SiteScrape.getTitle($)
const hostname = SiteScrape.getHostName(url)
let majorContent = ''
majorContent += $('[class*=content]').text()
.replace(removeWhitespace, " ") //Remove all whitespace
.replace(/\W\s/g, '') //Remove all non alphanumeric characters
.substring(0,3000)
.toLowerCase()
majorContent += $('[id*=content]').text().replace(removeWhitespace, " ")
.replace(removeWhitespace, " ") //Remove all whitespace
.replace(/\W\s/g, '') //Remove all non alphanumeric characters
.substring(0,3000) //Limit characters
.toLowerCase()
const thumbnail = SiteScrape.getDisplayImage($, url)
//Count frequency of each word in scraped text
let frequency = {}
majorContent.split(' ').forEach(word => {
if(excludeWords.includes(word)){
return //Exclude certain words
}
if(!frequency[word]){
frequency[word] = 0
}
frequency[word]++
const keywords = SiteScrape.getKeywords($)
var desiredSearchText = ''
desiredSearchText += pageTitle + "\n"
desiredSearchText += keywords
console.log({
pageTitle,
hostname,
thumbnail,
keywords
})
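//The og:image lookup removed above now lives behind SiteScrape.getDisplayImage($, url).
//A rough sketch only: the idea that the url argument is used to resolve relative image
//paths is an assumption, not confirmed by this excerpt.
SiteScrape.getDisplayImage = ($, url) => {
    let thumbnail = ''
    const metadata = $('meta[property="og:image"]')
    if(metadata && metadata[0] && metadata[0].attribs){
        thumbnail = metadata[0].attribs.content
    }
    //Assumed: resolve relative image paths against the page URL
    if(thumbnail && thumbnail.indexOf('http') != 0){
        thumbnail = new URL(thumbnail, url).href
    }
    return thumbnail
}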
//Create a sortable array
var sortable = [];
for (var index in frequency) {
if(frequency[index] > 1){
sortable.push([index, frequency[index]]);
}
}
//Sort them by most used words in the list
sortable.sort(function(a, b) {
return b[1] - a[1];
});
let finalWords = []
for(let i=0; i<5; i++){
if(sortable[i] && sortable[i][0]){
finalWords.push(sortable[i][0])
}
}
if(finalWords.length > 0){
desiredSearchText += 'Keywords: ' + finalWords.join(', ')
}
// throw new Error('Ending this function early.')
// console.log('TexT Scraped')
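//A possible SiteScrape.getKeywords, folding in the word-frequency logic this hunk
//removes (content selectors, stop-word filter, top five words). The stop-word list is
//abbreviated here, and returning a 'Keywords: ...' string is an assumption based on how
//desiredSearchText was built before.
SiteScrape.getKeywords = ($) => {
    const removeWhitespace = /\s+/g
    const excludeWords = ['share','facebook','twitter','reddit' /* ...same stop words as the deleted list... */]
    //Pull text out of likely content containers, as the old inline code did
    let majorContent = ''
    majorContent += $('[class*=content]').text().replace(removeWhitespace, ' ').substring(0, 3000).toLowerCase()
    majorContent += ' ' + $('[id*=content]').text().replace(removeWhitespace, ' ').substring(0, 3000).toLowerCase()
    //Count how often each non-excluded word appears
    let frequency = {}
    majorContent.split(' ').forEach(word => {
        if(excludeWords.includes(word)){ return }
        frequency[word] = (frequency[word] || 0) + 1
    })
    //Keep words used more than once, sort by count, take the top five
    let sortable = Object.entries(frequency).filter(pair => pair[1] > 1)
    sortable.sort((a, b) => b[1] - a[1])
    const finalWords = sortable.slice(0, 5).map(pair => pair[0])
    return finalWords.length > 0 ? 'Keywords: ' + finalWords.join(', ') : ''
}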
@@ -532,9 +465,10 @@ Attachment.processUrl = (userId, noteId, url) => {
})
.catch(error => {
console.log('Issue with scrape')
// console.log('Scrape pooped out')
// console.log('Issue with scrape')
console.log(error)
resolve('')
// resolve('')
})
requestTimeout = setTimeout( () => {


@@ -167,10 +167,10 @@ Note.update = (io, userId, noteId, noteText, noteTitle, color, pinned, archived,
return new Promise((resolve, reject) => {
//Prevent note loss if it saves with empty text
if(ProcessText.removeHtml(noteText) == ''){
//if(ProcessText.removeHtml(noteText) == ''){
// console.log('Not saving empty note')
// resolve(false)
}
//}
const now = Math.round((+new Date)/1000)