let db = require('@config/database')

let Attachment = module.exports = {}

const cheerio = require('cheerio')
const rp = require('request-promise')
const request = require('request')
const fs = require('fs')

//Returns all attachments for a user, optionally filtered to a single note
Attachment.search = (userId, noteId) => {
  return new Promise((resolve, reject) => {

    let params = [userId]
    let query = 'SELECT * FROM attachment WHERE user_id = ? '

    //Only filter by note if a real note id was passed
    if(noteId && noteId > 0){
      query += 'AND note_id = ? '
      params.push(noteId)
    }

    query += 'ORDER BY last_indexed DESC '

    db.promise()
    .query(query, params)
    .then(([rows]) => {
      resolve(rows) //Return all attachments found by query
    })
    .catch(reject)
  })
}

//Returns all attachments for a note
Attachment.forNote = (userId, noteId) => {
  return new Promise((resolve, reject) => {
    db.promise()
    .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? ORDER BY last_indexed DESC;`, [userId, noteId])
    .then(([rows]) => {
      resolve(rows) //Return all attachments found by query
    })
    .catch(reject)
  })
}

//Returns URL attachments (attachment_type = 1) for a note
Attachment.urlForNote = (userId, noteId) => {
  return new Promise((resolve, reject) => {
    db.promise()
    .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1 ORDER BY last_indexed DESC;`, [userId, noteId])
    .then(([rows]) => {
      resolve(rows) //Return all URL attachments found by query
    })
    .catch(reject)
  })
}

//Update attachment text in database
Attachment.update = (userId, attachmentId, updatedText, noteId) => {
  return new Promise((resolve, reject) => {
    db.promise()
    .query(`UPDATE attachment SET text = ? WHERE id = ? AND user_id = ?`, [updatedText, attachmentId, userId])
    .then(() => {
      resolve(true)
    })
    .catch(reject)
  })
}

//Delete attachment row
Attachment.delete = (attachmentId) => {
  console.log('Delete Attachment', attachmentId)
  return new Promise((resolve, reject) => {
    db.promise()
    .query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
    .then(([rows]) => {
      resolve(rows) //Return result of delete query
    })
    .catch(reject)
  })
}
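/*
  Usage sketch (illustrative, not part of this module): the helpers above are
  thin wrappers around parameterized queries, so a route handler can chain them
  directly. The Express router and route shape here are assumptions.

  router.delete('/attachment/:id', (req, res) => {
    Attachment.delete(req.params.id)
      .then(() => res.send({ deleted: true }))
      .catch(error => res.status(500).send(error))
  })
*/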
//Create an attachment row for an uploaded file
Attachment.processUploadedFile = (userId, noteId, fileObject) => {
  return new Promise((resolve, reject) => {

    const created = Math.round((+new Date)/1000)
    const fileLocation = fileObject.filename
    const fileName = fileObject.originalname

    //Create attachment in DB with file name and provided data
    db.promise()
    .query(`
      INSERT INTO attachment
      (note_id, user_id, attachment_type, \`text\`, last_indexed, file_location)
      VALUES (?, ?, ?, ?, ?, ?)
    `, [noteId, userId, 2, fileName, created, fileLocation])
    .then(() => {
      console.log('Created attachment for', fileName)
      resolve({ fileName, fileLocation }) //Return file data for the new row
    })
    .catch(reject)
  })
}

//Scans note text for websites, returns searchable text from all URL attachments
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
  return new Promise((resolve, reject) => {

    let solrAttachmentText = '' //Final searchable scrape text for note

    if(noteText.length == 0){
      return resolve(solrAttachmentText)
    }

    Attachment.urlForNote(userId, noteId).then(attachments => {

      //Find all URLs in text
      //@TODO - Use the process text library for this function
      const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
      let allUrls = noteText.match(urlPattern)

      //Remove all duplicates (new Set tolerates a null match result)
      let foundUrls = [...new Set(allUrls)]

      //Go through each existing attachment, check for already scraped URLs
      attachments.forEach(attachment => {

        let urlIndex = foundUrls.indexOf(attachment.url)
        if(urlIndex != -1){
          //URL already scraped, push its text and remove it from the found set
          solrAttachmentText += attachment.text
          foundUrls.splice(urlIndex, 1)
        } else {
          //Existing attachment URL is no longer in the note, remove it
          Attachment.delete(attachment.id)
        }
      })

      //No newly found URLs, resolve with looked up attachment text
      if(foundUrls.length == 0){
        return resolve(solrAttachmentText)
      }

      //Process the remaining URLs into attachments
      Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then(freshlyScrapedText => {
        solrAttachmentText += freshlyScrapedText
        resolve(solrAttachmentText)
      })
    })
  })
}

//Scrapes each URL, creating a DB entry per scrape, returns the combined text
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
  return new Promise((resolve, reject) => {

    if(foundUrls == null || foundUrls.length == 0){
      return resolve('')
    }

    console.log('About to scrape')
    console.log(foundUrls)

    let processedCount = 0
    let scrapedText = ''

    //Process each URL passed to function, a DB entry will be created for each scrape
    foundUrls.forEach(url => {
      Attachment.processUrl(userId, noteId, url).then(freshlyScrapedText => {
        scrapedText += freshlyScrapedText
        processedCount++

        //All URLs have been scraped, return data
        if(processedCount == foundUrls.length){
          resolve(scrapedText)
        }
      })
    })
  })
}

//Downloads a file into the static files folder, resolves with the saved name or null
Attachment.downloadFileFromUrl = (url) => {
  return new Promise((resolve, reject) => {

    //Nothing to download (for example, no og:image was found)
    if(!url){
      return resolve(null)
    }

    //Random file name to avoid collisions
    const random = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15)
    const filePath = '../staticFiles/'
    let fileName = filePath + random + '_img'

    console.log('Getting ready to scrape ', url)

    request(url)
    .on('error', error => {
      console.log(error)
      resolve(null)
    })
    .on('response', res => {
      console.log(res.statusCode)
      console.log(res.headers['content-type'])
    })
    .pipe(fs.createWriteStream(fileName))
    .on('close', () => {
      console.log('Saved Image')
      resolve(random + '_img')
    })
  })
}
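/*
  Usage sketch (illustrative, not part of this module): downloadFileFromUrl
  resolves with the saved file name on success and null on any failure, so a
  caller can pass the result straight into an INSERT without an error branch.
  The imgur address is just a sample image URL.

  Attachment.downloadFileFromUrl('https://i.imgur.com/5PVufWa.jpg')
    .then(savedName => {
      //savedName is a random name like 'k2j4h6_img' inside ../staticFiles/, or null
    })
*/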
//Scrapes a URL, builds searchable text from the page, and saves it as an attachment
Attachment.processUrl = (userId, noteId, url) => {

  const scrapeTime = 20*1000 //Give up on the scrape after 20 seconds

  return new Promise((resolve, reject) => {

    //Common words excluded from the word frequency list
    const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
    'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
    'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
    'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
    'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
    'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

    const removeWhitespace = /\s+/g

    const options = {
      uri: url,
      simple: true,
      timeout: scrapeTime,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' //Simulate Google crawler headers
      },
      transform: function (body) {
        return cheerio.load(body)
      }
    }

    let requestTimeout = null
    let thumbnail = null
    let desiredSearchText = '' //Declared here so the timeout fallback below can set it

    //Named scrapeRequest so it does not shadow the request library above
    let scrapeRequest = rp(options)
    .then($ => {

      clearTimeout(requestTimeout)

      //Page title and first level headers always go into the search text
      let pageTitle = $('title').text().replace(removeWhitespace, " ")
      desiredSearchText += pageTitle + "\n"

      let header = $('h1').text().replace(removeWhitespace, " ")
      desiredSearchText += header + "\n"

      //Use the og:image meta tag as the attachment thumbnail, if present
      let metadata = $('meta[property="og:image"]')
      console.log('Scrape metadata')
      if(metadata && metadata[0] && metadata[0].attribs){
        console.log('Found metadata image')
        console.log(metadata[0].attribs.content)
        thumbnail = metadata[0].attribs.content
      }

      //Pull text out of elements whose class or id mentions "content"
      let majorContent = ''
      majorContent += $('[class*=content]').text()
        .replace(removeWhitespace, " ") //Collapse all whitespace
        .replace(/\W\s/g, '') //Remove non alphanumeric characters
        .substring(0, 3000) //Limit characters
        .toLowerCase()
      majorContent += $('[id*=content]').text()
        .replace(removeWhitespace, " ") //Collapse all whitespace
        .replace(/\W\s/g, '') //Remove non alphanumeric characters
        .substring(0, 3000) //Limit characters
        .toLowerCase()

      //Count frequency of each word in scraped text
      let frequency = {}
      majorContent.split(' ').forEach(word => {
        if(excludeWords.includes(word)){
          return //Exclude certain words
        }
        if(!frequency[word]){
          frequency[word] = 0
        }
        frequency[word]++
      })

      //Create a sortable array of [word, count] pairs
      let sortable = []
      for(let index in frequency){
        if(frequency[index] > 1){
          sortable.push([index, frequency[index]])
        }
      }

      //Sort them by most used words in the list
      sortable.sort(function(a, b){
        return b[1] - a[1]
      })

      //Keep the top 15 words
      let finalWords = []
      for(let i = 0; i < 15; i++){
        if(sortable[i] && sortable[i][0]){
          finalWords.push(sortable[i][0])
        }
      }

      desiredSearchText += finalWords.join(', ')

      console.log('Text scraped')
      console.log(desiredSearchText)

      const created = Math.round((+new Date)/1000)

      //Download the thumbnail, then save its file name with the attachment
      Attachment.downloadFileFromUrl(thumbnail)
      .then(thumbnailFilename => {

        //Create attachment in DB with scrape text and provided data
        db.promise()
        .query(`INSERT INTO attachment
          (note_id, user_id, attachment_type, text, url, last_indexed, file_location)
          VALUES (?, ?, ?, ?, ?, ?, ?)`,
          [noteId, userId, 1, desiredSearchText, url, created, thumbnailFilename])
        .then(() => {
          resolve(desiredSearchText) //Return found text
        })
        .catch(console.log)
      })
    })
    .catch(error => {
      console.log('Issue with scrape')
      console.log(error)
      clearTimeout(requestTimeout) //Stop the fallback insert from also firing
      resolve('')
    })
    //If the scrape takes too long, cancel it and fall back to saving the bare URL
    requestTimeout = setTimeout(() => {

      console.log('Cancel the request, it\'s taking too long.')
      scrapeRequest.cancel()

      desiredSearchText = url
      const created = Math.round((+new Date)/1000)

      //Create attachment in DB with the URL as its text
      db.promise()
      .query(`INSERT INTO attachment
        (note_id, user_id, attachment_type, text, url, last_indexed)
        VALUES (?, ?, ?, ?, ?, ?)`,
        [noteId, userId, 1, desiredSearchText, url, created])
      .then(() => {
        resolve(desiredSearchText) //Return found text
      })
      .catch(console.log)

    }, scrapeTime)
  })
}
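/*
  Usage sketch (illustrative, not part of this module): how a caller might wire
  the scrape pipeline together when a note is saved. userId, noteId, and
  noteText are assumed to come from the caller's request handling.

  Attachment.scanTextForWebsites(userId, noteId, noteText)
    .then(searchableText => {
      //searchableText combines text from kept attachments and freshly scraped URLs
      console.log('Indexable attachment text:', searchableText)
    })

  //Later, list everything attached to the note (attachment_type 1 = URL, 2 = file)
  Attachment.forNote(userId, noteId)
    .then(attachments => console.log(attachments))
*/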