// SolidScribe/server/models/Attachment.js

let db = require('@config/database')

let Attachment = module.exports = {}

const cheerio = require('cheerio')
const rp = require('request-promise')
const request = require('request')
const fs = require('fs')

//Find a user's attachments, optionally narrowed down to a single note
// userId - owner whose attachments are searched
// noteId - optional; only used as a filter when truthy and greater than zero
//Resolves with the rows returned by the query, ordered by last_indexed descending
//Rejects with the database error when the query fails
Attachment.search = (userId, noteId) => {
	return new Promise((resolve, reject) => {

		const params = [userId]
		let query = 'SELECT * FROM attachment WHERE user_id = ? '

		if(noteId && noteId > 0){
			query += 'AND note_id = ? '
			params.push(noteId)
		}

		query += 'ORDER BY last_indexed DESC '

		db.promise()
			.query(query, params)
			.then(result => {
				resolve(result[0]) //First element of the result holds the attachments found by the query
			})
			//Hand failures to the caller, only logging them left this promise pending forever
			.catch(reject)
	})
}

//Returns all attachments of one note that belong to the given user
// userId - owner of the attachments
// noteId - note the attachments are linked to
//Resolves with the rows returned by the query, ordered by last_indexed descending
//Rejects with the database error when the query fails
Attachment.forNote = (userId, noteId) => {
	return new Promise((resolve, reject) => {
		db.promise()
			.query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? ORDER BY last_indexed DESC;`, [userId, noteId])
			.then(result => {
				resolve(result[0]) //First element of the result holds the attachments found by the query
			})
			//Hand failures to the caller, only logging them left this promise pending forever
			.catch(reject)
	})
}

//Returns the attachments of a note that have attachment_type 1
//(the type this file writes for attachments created from scraped URLs)
// userId - owner of the attachments
// noteId - note the attachments are linked to
//Resolves with the rows returned by the query, ordered by last_indexed descending
//Rejects with the database error when the query fails
Attachment.urlForNote = (userId, noteId) => {
	return new Promise((resolve, reject) => {
		db.promise()
			.query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1 ORDER BY last_indexed DESC;`, [userId, noteId])
			.then(result => {
				resolve(result[0]) //First element of the result holds the attachments found by the query
			})
			//Hand failures to the caller, only logging them left this promise pending forever
			.catch(reject)
	})
}

//Update the text of an attachment in the database
// userId - owner of the attachment, the row is only changed when it belongs to this user
// attachmentId - id of the attachment row
// updatedText - new value for the text column
// noteId - accepted for callers but not used by the query
//Resolves with true once the statement has run
//Rejects with the database error when the query fails
Attachment.update = (userId, attachmentId, updatedText, noteId) => {
	return new Promise((resolve, reject) => {
		db.promise()
			.query(`UPDATE attachment SET text = ? WHERE id = ? AND user_id = ?`,
				[updatedText, attachmentId, userId])
			.then(() => {
				resolve(true)
			})
			//Hand failures to the caller, only logging them left this promise pending forever
			.catch(reject)
	})
}

//Delete one attachment row by id
// attachmentId - id of the attachment row to remove
//Resolves with the first element of the result returned for the DELETE statement
//Rejects with the database error when the query fails
//NOTE(review): unlike the update query, this statement is not restricted by user_id - confirm callers check ownership
Attachment.delete = (attachmentId) => {
	console.log('Delete Attachment', attachmentId)
	return new Promise((resolve, reject) => {
		db.promise()
			.query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
			.then(result => {
				resolve(result[0])
			})
			//Hand failures to the caller, only logging them left this promise pending forever
			.catch(reject)
	})
}

//Create an attachment row (attachment_type 2) for a file that was uploaded to a note
// userId - owner of the new attachment
// noteId - note the file is attached to
// fileObject - upload descriptor; only .filename (stored as file_location) and
//              .originalname (stored as the attachment text) are read
//Resolves with { fileName, fileLocation } once the row is inserted
//Rejects with the database error when the insert fails
Attachment.processUploadedFile = (userId, noteId, fileObject) => {
	return new Promise((resolve, reject) => {

		const created = Math.round(Date.now()/1000) //Unix time in seconds
		const fileLocation = fileObject.filename
		const fileName = fileObject.originalname

		db.promise()
		.query(`
			INSERT INTO attachment
				(note_id, user_id, attachment_type, \`text\`, last_indexed, file_location)
			VALUES
				(?, ?, ?, ?, ?, ?)
		`, [noteId, userId, 2, fileName, created, fileLocation])
		.then(() => {
			console.log('Created attachment for ',fileName)
			resolve({ fileName, fileLocation })
		})
		//Hand failures to the caller, only logging them left this promise pending forever
		.catch(reject)
	})
}

//Scans note text for websites and keeps the note's URL attachments in sync with it
// userId - owner of the note
// noteId - note the text belongs to
// noteText - text to scan, must be a string
//Existing URL attachments whose URL is still in the text contribute their stored text,
//attachments whose URL is gone are deleted (not waited for) and URLs without an
//attachment are passed to Attachment.scrapeUrlsCreateAttachments.
//An empty text takes the same path, so every URL attachment of the note gets deleted.
//Resolves with the combined searchable text of all URL attachments
//Rejects when noteText is not a string or when the lookup or the scrape rejects
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
	return new Promise((resolve, reject) => {

		//Find all URLs in text
		//@TODO - Use the process text library for this function
		const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm

		//match() returns null when there is no URL, the Set removes duplicates
		const foundUrls = [...new Set(noteText.match(urlPattern) || [])]

		Attachment.urlForNote(userId, noteId)
		.then(attachments => {

			let solrAttachmentText = '' //Searchable text of attachments that are already scraped

			attachments.forEach(attachment => {
				const urlIndex = foundUrls.indexOf( attachment.url )

				if(urlIndex != -1){
					//URL already scraped, reuse its text and take it off the list of new URLs
					solrAttachmentText += attachment.text
					foundUrls.splice(urlIndex, 1)
				} else {
					//Existing attachment is no longer in the note, remove it without waiting;
					//a failed delete is logged so it can not surface as an unhandled rejection
					Attachment.delete(attachment.id).catch(console.log)
				}
			})

			//No newly found URLs, finish with the looked up attachment text
			if(foundUrls.length == 0){
				return solrAttachmentText
			}

			//Process the remaining URLs into attachments
			return Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls)
				.then(freshlyScrapedText => solrAttachmentText + freshlyScrapedText)
		})
		.then(resolve)
		.catch(reject)
	})
}

//Scrape every URL through Attachment.processUrl and return the scraped text
// userId - owner of the note
// noteId - note the URLs were found in
// foundUrls - array of URLs to scrape, null or empty resolves with ''
//Resolves with the scraped texts joined in the order of foundUrls.
//A URL whose processing rejects is logged and contributes '' so one bad URL
//can not keep the whole batch from finishing.
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
	return new Promise((resolve, reject) => {

		if(foundUrls == null || foundUrls.length == 0){
			return resolve('')
		}

		console.log('About to scrape')
		console.log(foundUrls)

		//Start all scrapes at once, processUrl is expected to create the DB entry for each
		const scrapes = foundUrls.map(url => {
			return Attachment.processUrl(userId, noteId, url)
				.catch(error => {
					console.log(error)
					return ''
				})
		})

		//Promise.all keeps the results in input order, independent of which scrape finishes first
		Promise.all(scrapes)
			.then(texts => resolve(texts.join('')))
			.catch(reject)
	})
}


//Download the file behind a URL into ../staticFiles/ under a random name ending in _img
// url - address of the file; processUrl passes whatever it found as og:image, which may be nothing
//Resolves with the generated file name (without the directory) once the file is written.
//Resolves with null when no URL was given or when the request or the file write fails.
Attachment.downloadFileFromUrl = (url) => {

	return new Promise((resolve, reject) => {

		//Nothing to download, do not hand an empty URL to request
		if(!url){
			return resolve(null)
		}

		const random = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15)
		const savedName = random + '_img'
		const fileName = '../staticFiles/' + savedName

		console.log('Getting ready to scrape ', url)

		const download = request(url)
		const fileStream = fs.createWriteStream(fileName)

		let finished = false //Only the first outcome (saved or failed) counts

		const fail = error => {
			console.log(error)
			if(finished){ return }
			finished = true

			if(typeof download.abort === 'function'){
				download.abort() //Stop fetching data nobody will store
			}
			fileStream.destroy() //Release the file handle
			fs.unlink(fileName, () => {}) //Best effort removal of the partial file
			resolve(null)
		}

		//Without a listener a write error (e.g. missing directory) is thrown as an uncaught exception
		fileStream.on('error', fail)
		fileStream.on('close', () => {
			if(finished){ return }
			finished = true
			console.log('Saved Image')
			resolve(savedName)
		})

		download
			.on('error', fail)
			.on('response', res => {
				console.log(res.statusCode)
				console.log(res.headers['content-type'])
			})
			.pipe(fileStream)
	})
}


//Words ignored when ranking the most frequent words of a scraped page
const scrapeExcludeWords = new Set(['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its'])

//Builds the searchable text of a fetched page and looks up its og:image
// $ - cheerio instance loaded with the page body
//Returns { searchText, thumbnail }
// searchText - page title, h1 text and up to 15 of the most repeated content words, one part per line
// thumbnail - content of the og:image meta tag, null when the page has no such tag
const summarizeScrapedPage = ($) => {

	const removeWhitespace = /\s+/g

	const pageTitle = $('title').text().replace(removeWhitespace, " ")
	const header = $('h1').text().replace(removeWhitespace, " ")

	let thumbnail = null
	const metadata = $('meta[property="og:image"]')
	console.log('Scrape metadata')
	if(metadata && metadata[0] && metadata[0].attribs){
		console.log('Found metadata image')
		console.log(metadata[0].attribs.content)
		thumbnail = metadata[0].attribs.content
	}

	//Text of all elements matching the selector: whitespace collapsed, limited to 3000 characters, lower case
	//NOTE(review): /\W\s/ removes a non-word character together with the whitespace after it,
	//so "one, two" becomes "onetwo" - confirm that merging words like this is intended
	const cleanContent = selector => $(selector).text()
		.replace(removeWhitespace, " ")
		.replace(/\W\s/g, '')
		.substring(0,3000)
		.toLowerCase()

	//Space between the two parts keeps the last and first word from running together
	const majorContent = cleanContent('[class*=content]') + ' ' + cleanContent('[id*=content]')

	//Count frequency of each word, a Map keeps words like "constructor" from hitting object prototype keys
	const frequency = new Map()
	majorContent.split(' ').forEach(word => {
		if(word === '' || scrapeExcludeWords.has(word)){
			return //Skip gaps and excluded words
		}
		frequency.set(word, (frequency.get(word) || 0) + 1)
	})

	//Words used more than once, most used first, limited to 15
	const finalWords = [...frequency.entries()]
		.filter(entry => entry[1] > 1)
		.sort((a, b) => b[1] - a[1])
		.slice(0, 15)
		.map(entry => entry[0])

	const searchText = pageTitle + "\n" + header + "\n" + finalWords.join(', ')
	console.log('TexT Scraped')
	console.log(searchText)

	return { searchText, thumbnail }
}

//Scrape one URL and store the result as an attachment (attachment_type 1) of the note
// userId - owner of the note
// noteId - note the URL was found in
// url - address to scrape
//Page scraped: downloads the og:image thumbnail when there is one, inserts the attachment
//              with the scraped text and resolves with that text.
//Scrape or insert failed: inserts a placeholder attachment whose text is the URL and resolves with ''.
//              (Before, the placeholder was only written once the never cleared 20 second timer fired.)
//No answer within 20 seconds: cancels the request, inserts the placeholder and resolves with the URL.
Attachment.processUrl = (userId, noteId, url) => {

	const scrapeTime = 20*1000

	return new Promise((resolve) => {

		const options = {
			uri: url,
			simple: true,
			timeout: scrapeTime,
			headers: {
				'User-Agent':'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' //Simulate google headers
			},
			transform: function (body) {
				return cheerio.load(body)
			}
		}

		let requestTimeout = null
		let placeholderStarted = false //Set once the placeholder path has taken over

		//Store the URL itself as attachment text, then resolve with the given text
		const storePlaceholder = (resolveWith) => {
			if(placeholderStarted){ return }
			placeholderStarted = true
			clearTimeout(requestTimeout)

			const created = Math.round(Date.now()/1000)

			db.promise()
			.query(`INSERT INTO attachment
				(note_id, user_id, attachment_type, text, url, last_indexed)
				VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, url, url, created])
			.then(() => resolve(resolveWith))
			.catch(error => {
				//Still settle, otherwise the caller waits forever
				console.log(error)
				resolve(resolveWith)
			})
		}

		const scrape = rp(options)
		.then($ => {

			clearTimeout(requestTimeout)
			if(placeholderStarted){ return } //Timed out already, the placeholder was written

			const page = summarizeScrapedPage($)
			const created = Math.round(Date.now()/1000)

			//The thumbnail is optional, the attachment is created with or without it
			const thumbnailDownload = page.thumbnail
				? Attachment.downloadFileFromUrl(page.thumbnail).catch(error => {
					console.log(error)
					return null
				})
				: Promise.resolve(null)

			//Returned so that a failure in here reaches the catch below instead of being lost
			return thumbnailDownload
			.then(thumbnailFilename => {
				return db.promise()
				.query(`INSERT INTO attachment
					(note_id, user_id, attachment_type, text, url, last_indexed, file_location)
					VALUES (?, ?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, page.searchText, url, created, thumbnailFilename])
			})
			.then(() => {
				resolve(page.searchText) //Return found text
			})
		})
		.catch(error => {
			console.log('Issue with scrape')
			console.log(error)
			storePlaceholder('')
		})

		requestTimeout = setTimeout( () => {
			console.log('Cancel the request, its taking to long.')
			if(typeof scrape.cancel === 'function'){
				scrape.cancel()
			}
			storePlaceholder(url)
		}, scrapeTime)
	})
}