let db = require('@config/database')

const cheerio = require('cheerio')
const rp = require('request-promise')

let Attachment = module.exports = {}

//Look up all of a user's website attachments (attachment_type = 1) for a single note
Attachment.forNote = (userId, noteId) => {
	return new Promise((resolve, reject) => {
		db.promise()
		.query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1;`, [userId, noteId])
		.then(([rows, fields]) => {
			resolve(rows) //Resolve with all attachments found by the query
		})
		.catch(console.log)
	})
}
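
//Usage sketch (assumption — the express-style handler and field names are
//hypothetical, not part of this module):
//
//   Attachment.forNote(req.userId, req.body.noteId)
//       .then(attachments => res.json(attachments))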

//Remove a single attachment row by its id
Attachment.delete = (attachmentId) => {
	return new Promise((resolve, reject) => {
		db.promise()
		.query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
		.then(([result]) => {
			resolve(result) //Resolve with the result header of the DELETE
		})
		.catch(console.log)
	})
}
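
//Usage sketch (assumption — affectedRows comes from the mysql2 result header):
//
//   Attachment.delete(attachment.id).then(result => console.log(result.affectedRows))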

//Build the searchable attachment text for a note: reuse attachments whose URLs are
//still in the note, delete stale ones, then scrape any newly added URLs
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
	return new Promise((resolve, reject) => {

		let solrAttachmentText = '' //Final searchable scrape text for note

		if(noteText.length == 0){ return resolve(solrAttachmentText) }

		Attachment.forNote(userId, noteId).then(attachments => {

			//Find all URLs in text
			const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
			let allUrls = noteText.match(urlPattern)

			//Remove all duplicates (match() returns null on no matches, which Set treats as empty)
			let foundUrls = [...new Set(allUrls)]
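
			//For example (hypothetical note text): 'see https://a.com and https://a.com'
			//would give allUrls = ['https://a.com', 'https://a.com'] and foundUrls = ['https://a.com']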

			//Go through each existing attachment and check for its URL in the note
			attachments.forEach(attachment => {

				let urlIndex = foundUrls.indexOf(attachment.url)

				if(urlIndex != -1){
					//URL already scraped: reuse its text
					solrAttachmentText += attachment.text
					foundUrls.splice(urlIndex, 1) //Remove the already-scraped URL from the found set
				} else {
					//URL no longer appears in the note: remove the stale attachment
					Attachment.delete(attachment.id)
				}
			})

			//No newly found URLs to scrape: resolve with the reused attachment text
			if(foundUrls.length == 0){
				return resolve(solrAttachmentText)
			}

			//Process the remaining URLs into attachments
			Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then(freshlyScrapedText => {

				solrAttachmentText += freshlyScrapedText
				resolve(solrAttachmentText)
			})
		})
	})
}
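
//Usage sketch (assumption — the note-save flow and the Solr indexing step are
//hypothetical; this file only produces the searchable text):
//
//   Attachment.scanTextForWebsites(user.id, note.id, note.text)
//       .then(searchText => { /* index note text + searchText in Solr */ })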

//Scrape every URL and return the combined scraped text; a DB entry is created per URL
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
	return new Promise((resolve, reject) => {

		console.log('About to scrape')
		console.log(foundUrls)

		if(foundUrls == null || foundUrls.length == 0){ return resolve('') }

		let processedCount = 0
		let scrapedText = ''

		//Process each URL passed to the function
		foundUrls.forEach(url => {
			Attachment.processUrl(userId, noteId, url).then(freshlyScrapedText => {

				scrapedText += freshlyScrapedText
				processedCount++

				//All URLs have been scraped, return the combined text
				//(concatenation order depends on which scrape finishes first)
				if(processedCount == foundUrls.length){
					resolve(scrapedText)
				}
			})
		})
	})
}
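
//Equivalent sketch using Promise.all (assumption — only equivalent because processUrl
//below always resolves; a rejection would fail the whole batch):
//
//   Promise.all((foundUrls || []).map(url => Attachment.processUrl(userId, noteId, url)))
//       .then(texts => texts.join(''))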

//Scrape a single URL, save the result as an attachment row, and resolve with the text
Attachment.processUrl = (userId, noteId, url) => {
	return new Promise((resolve, reject) => {

		//Common stop words excluded from the word-frequency count
		const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
		'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
		'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
		'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
		'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
		'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

		const removeWhitespace = /\s+/g //Collapse runs of whitespace into single spaces

		// console.log('Scraping', url)

		const options = {
			uri: url,
			transform: function (body) {
				return cheerio.load(body) //Parse the response HTML with cheerio
			}
		}

		rp(options).then($ => {

			let desiredSearchText = ''

			//Grab the page title and top-level headers
			let pageTitle = $('title').text().replace(removeWhitespace, " ")
			desiredSearchText += pageTitle + "\n"

			let header = $('h1').text().replace(removeWhitespace, " ")
			desiredSearchText += header + "\n"

			//Pull text from elements whose class or id suggests main page content
			let majorContent = ''
			majorContent += $('[class*=content]').text()
			.replace(removeWhitespace, " ") //Collapse whitespace
			.replace(/[^\w\s]/g, '') //Remove all non-alphanumeric characters
			.substring(0,3000) //Limit characters
			.toLowerCase()
			majorContent += $('[id*=content]').text()
			.replace(removeWhitespace, " ") //Collapse whitespace
			.replace(/[^\w\s]/g, '') //Remove all non-alphanumeric characters
			.substring(0,3000) //Limit characters
			.toLowerCase()

			//Count the frequency of each word in the scraped text
			let frequency = {}
			majorContent.split(' ').forEach(word => {
				if(excludeWords.includes(word)){
					return //Skip stop words
				}
				if(!frequency[word]){
					frequency[word] = 0
				}
				frequency[word]++
			})
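
			//For example (hypothetical input): 'solr search solr index' would yield
			//frequency = { solr: 2, search: 1, index: 1 }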

			//Keep words that appear more than once in a sortable [word, count] array
			let sortable = []
			for (let word in frequency) {
				if(frequency[word] > 1){
					sortable.push([word, frequency[word]])
				}
			}

			//Sort by descending frequency
			sortable.sort(function(a, b) {
				return b[1] - a[1]
			})

			//Keep the 15 most frequent words
			let finalWords = []
			for(let i=0; i<15; i++){
				if(sortable[i] && sortable[i][0]){
					finalWords.push(sortable[i][0])
				}
			}

			desiredSearchText += finalWords.join(', ')

			console.log('Text scraped')
			console.log(desiredSearchText)

			const created = Math.round((+new Date)/1000) //Unix timestamp in seconds

			//Create attachment in DB with scraped text and provided data
			db.promise()
			.query(`INSERT INTO attachment
			(note_id, user_id, attachment_type, text, url, last_indexed)
			VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
			.then(() => {
				resolve(desiredSearchText) //Resolve with the scraped text
			})
			.catch(console.log)

		})
		.catch(error => {
			//A dead link or timeout would otherwise leave this promise pending forever and
			//stall the counter in scrapeUrlsCreateAttachments, so resolve with empty text
			console.log('Could not scrape', url, error.message)
			resolve('')
		})
	})
}
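
//End-to-end sketch (assumption — the ids and note text are hypothetical; in the app
//this module is driven by whatever saves notes, not run standalone):
//
//   Attachment.scanTextForWebsites(1, 42, 'Read https://example.com later')
//       .then(searchText => console.log('Searchable text:', searchText))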