let db = require('@config/database')
let SiteScrape = require('@helpers/SiteScrape')
const cs = require('@helpers/CryptoString')

let Attachment = module.exports = {}

const cheerio = require('cheerio')
const rp = require('request-promise')
const request = require('request')
const fs = require('fs')
const path = require('path')
const gm = require('gm')
const tesseract = require("node-tesseract-ocr")

// Directory (relative to the process working directory) holding uploads and thumbnails
const filePath = '../staticFiles/'

// Socket timeout used when downloading a thumbnail image
const downloadTimeout = 10*1000

//
// Private helpers
//

// Current time as a unix timestamp, in seconds
const unixNow = () => Math.round(Date.now()/1000)

// Log an error and re-throw it, so the promise returned to the caller rejects
// instead of staying pending forever.
const logAndRethrow = (error) => {
    console.log(error)
    throw error
}

// Best effort removal of a file from the static directory
const removeStaticFile = (fileName, label) => {
    try {
        fs.unlinkSync(filePath+fileName)
    } catch(err) {
        console.error(label+' Does not exist')
    }
}

// Tell the users open clients to refresh attachment counts and attachment lists.
// Uses the global SocketIo when it is defined, otherwise the `io` instance handed in.
// Silently does nothing when no socket server is available.
const notifyAttachmentsChanged = (userId, io) => {
    const socket = (typeof SocketIo != 'undefined') ? SocketIo : io
    if(!socket || typeof socket.to != 'function'){
        return
    }
    socket.to(userId).emit('update_counts')
    socket.to(userId).emit('update_note_attachments')
}

/**
 * Full text search over a users visible attachments.
 * @param {number} userId
 * @param {string} searchTerm
 * @returns {Promise<Array>} matching rows, each with an extra `snippet` column
 *     containing the text around the first occurrence of the search term.
 *     Rejects when the query fails.
 */
Attachment.textSearch = (userId, searchTerm) => {

    const front = 5
    const tail = 150

    const query = `
        SELECT
            *,
            substring(
                text,
                IF(LOCATE(?, text) > ${tail}, LOCATE(?, text) - ${front}, 1),
                ${tail} + LENGTH(?) + ${front}
            ) as snippet
        FROM attachment
        WHERE user_id = ?
        AND visible != 0
        AND MATCH(text)
        AGAINST(? IN NATURAL LANGUAGE MODE)
        LIMIT 1000`

    return db.promise()
        .query(query, [searchTerm, searchTerm, searchTerm, userId, searchTerm])
        .then(rows => rows[0]) //Return all attachments found by query
        .catch(logAndRethrow)
}

/**
 * List a users visible attachments, newest first.
 * @param {number} userId
 * @param {number} noteId - when present and > 0, only attachments of that note are returned
 * @param {string} attachmentType - 'links', 'files', 'archived', 'trashed' or empty
 * @param {number|string} offset - paging offset, defaults to 0
 * @param {number|string} setSize - page size, defaults to 20
 * @param {boolean} includeShared - when truthy only attachments of shared notes are returned,
 *     otherwise only attachments of notes that are not shared
 * @returns {Promise<Array>} attachment rows plus `share_user_id`. Rejects when the query fails.
 */
Attachment.search = (userId, noteId, attachmentType, offset, setSize, includeShared) => {

    console.log([userId, noteId, attachmentType, offset, setSize, includeShared])

    const params = [userId]
    let query = `
        SELECT attachment.*, note.share_user_id FROM attachment
        LEFT JOIN note ON (attachment.note_id = note.id)
        WHERE attachment.user_id = ? AND visible = 1
    `

    if(noteId && noteId > 0){
        //
        // Show everything if note ID is present
        //
        query += 'AND attachment.note_id = ? '
        params.push(noteId)

    } else {
        //
        // Other filters if NO note id
        //
        if(attachmentType == 'links'){
            query += 'AND attachment_type = 1 '
        }
        if(attachmentType == 'files'){
            query += 'AND attachment_type > 1 '
        }

        const archived = attachmentType == 'archived' ? '1':'0'
        const trashed = attachmentType == 'trashed' ? '1':'0'
        let noteState = `note.archived = ${archived} AND note.trashed = ${trashed}`

        if(!attachmentType){
            // Null note ID means it was pushed by bookmarklet
            noteState = `(${noteState}) OR attachment.note_id IS NULL`
        }

        // The whole group is wrapped in parentheses. AND binds tighter than OR, so an
        // unwrapped OR would drop the user_id and visible filters from that branch.
        query += `AND (${noteState}) `
    }

    if(!noteId){
        const sharedOrNot = includeShared ? ' NOT ':' '
        query += `AND note.share_user_id IS${sharedOrNot}NULL `
    }

    query += 'ORDER BY last_indexed DESC '

    // Only plain, non negative integers are ever interpolated into LIMIT
    const limitOffset = Math.max(0, parseInt(offset, 10) || 0)
    const requestedSize = parseInt(setSize, 10)
    const parsedSetSize = requestedSize > 0 ? requestedSize : 20

    query += ` LIMIT ${limitOffset}, ${parsedSetSize}`

    console.log(query)

    return db.promise()
        .query(query, params)
        .then(rows => rows[0]) //Return all attachments found by query
        .catch(logAndRethrow)
}

/**
 * All link attachments (attachment_type 1) saved for a note, newest first.
 * @returns {Promise<Array>} attachment rows. Rejects when the query fails.
 */
Attachment.urlForNote = (userId, noteId) => {
    return db.promise()
        .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1 ORDER BY last_indexed DESC;`, [userId, noteId])
        .then(rows => rows[0]) //Return all attachments found by query
        .catch(logAndRethrow)
}

/**
 * Update the text of an attachment in the database.
 * @param {number} userId - owner, the update only applies to their attachments
 * @param {number} attachmentId
 * @param {string} updatedText
 * @param {number} noteId - accepted for interface compatibility, not used
 * @returns {Promise<boolean>} true once the query ran. Rejects when the query fails.
 */
Attachment.update = (userId, attachmentId, updatedText, noteId) => {
    return db.promise()
        .query(`UPDATE attachment SET text = ? WHERE id = ? AND user_id = ?`,
            [updatedText, attachmentId, userId])
        .then(() => true)
        .catch(logAndRethrow)
}

/**
 * Delete an attachment and its files.
 * Link attachments are only hidden (visible = 0) while their note still exists,
 * unless `urlDelete` is set. Everything else is removed from the database.
 * @param {number} userId
 * @param {number} attachmentId
 * @param {boolean} [urlDelete=false] - force removal of link attachments
 * @returns {Promise<boolean>} true when done, also when the attachment does not exist.
 *     Rejects when a query fails.
 */
Attachment.delete = (userId, attachmentId, urlDelete = false) => {

    return db.promise()
        .query('SELECT * FROM attachment WHERE id = ? AND user_id = ? LIMIT 1', [attachmentId, userId])
        .then(rows => {

            //Attachment doesn't exist, return done
            if(rows[0].length == 0){
                return true
            }

            const attachment = rows[0][0]

            return db.promise()
                .query('SELECT count(id) as `exists` FROM note WHERE id = ?', [attachment.note_id])
                .then(noteRows => {

                    const noteExists = (noteRows[0][0]['exists'] > 0)

                    //Try to delete file and thumbnail, rows without a file have nothing to remove
                    if(attachment.file_location){
                        removeStaticFile(attachment.file_location, 'File')
                        removeStaticFile('thumb_'+attachment.file_location, 'Thumbnail')
                    }

                    //Do not delete link attachments, just hide them. They will be deleted if removed from note or if note is deleted
                    const hideOnly = attachment.attachment_type == 1 && !urlDelete && noteExists
                    const sql = hideOnly
                        ? `UPDATE attachment SET visible = 0 WHERE id = ? AND user_id = ?`
                        : `DELETE FROM attachment WHERE id = ? AND user_id = ?`

                    return db.promise().query(sql, [attachmentId, userId])
                })
                .then(() => true)
        })
        .catch(logAndRethrow)
}

/**
 * Register an uploaded file as an attachment of a note.
 * Renames the upload so it carries the extension of the original file name, inserts the
 * attachment row, starts thumbnail generation and tries to OCR text out of the file.
 * @param {number} userId
 * @param {number} noteId
 * @param {{filename: string, originalname: string}} fileObject - upload descriptor
 * @returns {Promise<{fileName: string, goodFileName: string}>} original name and stored name.
 *     Rejects when the rename or the insert fails. OCR problems are only logged.
 */
Attachment.processUploadedFile = (userId, noteId, fileObject) => {
    return new Promise((resolve, reject) => {

        const rawFilename = fileObject.filename
        const fileName = fileObject.originalname //Actual name of the file, dog.jpg

        // The original name is user input. Only take a plain ".ext" suffix of its base name,
        // a name without extension (or with odd characters in it) gets no extension at all.
        const rawExtension = path.extname(path.basename(String(fileName)))
        const extension = /^\.[\w-]+$/.test(rawExtension) ? rawExtension : ''
        const goodFileName = rawFilename+extension

        const uploadResult = { fileName, goodFileName }

        //Rename random file name to one with an extension
        fs.rename(filePath+rawFilename, filePath+goodFileName, (err) => {

            if(err){
                console.log(err)
                return reject(err)
            }

            db.promise()
                .query(`
                    INSERT INTO attachment
                    (note_id, user_id, attachment_type, \`text\`, last_indexed, file_location)
                    VALUES
                    (?, ?, ?, ?, ?, ?)
                `, [noteId, userId, 2, 'Add a description to -> '+fileName, unixNow(), goodFileName])
                .then(rows => {

                    // Thumbnail is generated in the background, it never rejects
                    Attachment.generateThumbnail(goodFileName)

                    // https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality
                    //psm 3 - default, 11 - as much text as possible
                    const config = { lang: "eng", oem: 1, psm: 3 }

                    return tesseract.recognize(filePath+goodFileName, config)
                        .then(text => {

                            // Drop the trailing character of the OCR output, then surrounding whitespace
                            const cleanText = text.slice(0, -1).trim()
                            if(cleanText.length <= 5){
                                return null
                            }

                            console.log('Inserting text')
                            return db.promise().query(
                                `UPDATE attachment SET text = ? WHERE id = ? AND user_id = ? LIMIT 1`,
                                [cleanText, rows[0].insertId, userId]
                            )
                        })
                        .catch(error => {
                            // OCR is best effort. It fails for anything that is not an image,
                            // the upload itself is still good.
                            console.log(error.message)
                        })
                })
                .then(() => resolve(uploadResult))
                .catch(error => {
                    console.log(error)
                    reject(error)
                })
        })
    })
}

/**
 * Write a 550px wide thumbnail named `thumb_<fileName>` next to the file.
 * Best effort: failures (for example non image files) are logged, never thrown.
 * @param {string} fileName - file inside the static directory
 * @returns {Promise<string>} always resolves with fileName
 */
Attachment.generateThumbnail = (fileName) => {
    return new Promise((resolve, reject) => {
        gm(filePath+fileName)
            .resize(550) //Resize to width of 550 px
            .quality(75) //compression level 0 - 100 (best)
            .write(filePath+'thumb_'+fileName, (err) => {
                if(err){
                    console.log(err)
                }
                resolve(fileName)
            })
    })
}

/**
 * Scans text for websites and syncs the notes link attachments with them.
 * Attachments whose URL is no longer in the text are deleted, URLs without an attachment
 * are scraped into new attachments. Empty text removes all link attachments of the note.
 * @param {object} io - socket server, used for notifications when no global SocketIo exists
 * @param {number} userId
 * @param {number} noteId
 * @param {string} noteText
 * @returns {Promise<string>} searchable text of all link attachments of the note.
 *     Rejects when a lookup fails.
 */
Attachment.scanTextForWebsites = (io, userId, noteId, noteText) => {

    return Attachment.urlForNote(userId, noteId)
        .then(attachments => {

            let solrAttachmentText = '' //Final searchable scrape text for note

            //Pull all the URLs out of the text
            const foundUrls = noteText ? (SiteScrape.getCleanUrls(noteText) || []) : []

            //Go through each saved URL, remove new URLs from saved URLs
            //If a URL is not found, delete it
            const removals = []
            attachments.forEach(attachment => {

                const urlIndex = foundUrls.indexOf(attachment.url)
                if(urlIndex != -1){
                    //URL already scraped, push text and continue
                    solrAttachmentText += attachment.text
                    foundUrls.splice(urlIndex, 1) //Remove existing from set of found
                } else {
                    //If existing attachment is not found in note, remove it.
                    //A failed removal is logged, it must not fail the whole scan.
                    removals.push(
                        Attachment.delete(userId, attachment.id, true).catch(console.log)
                    )
                }
            })

            return Promise.all(removals).then(() => {

                //No newly scraped URLs, resolve with looked up attachment text
                if(foundUrls.length == 0){
                    return solrAttachmentText
                }

                //Process the remaining URLs into attachments
                return Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls)
                    .then(freshlyScrapedText => {

                        //Once everything is done being scraped, emit new attachment events
                        notifyAttachmentsChanged(userId, io)

                        return solrAttachmentText + freshlyScrapedText
                    })
            })
        })
        .catch(logAndRethrow)
}

/**
 * Scrape every URL and create an attachment for each one.
 * @param {number} userId
 * @param {number|null} noteId - null for URLs pushed by bookmarklet
 * @param {string[]} foundUrls
 * @returns {Promise<string>} scraped text of all URLs, concatenated in the order of foundUrls.
 *     A URL that fails contributes no text, it never fails the whole batch.
 */
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {

    if(foundUrls == null || foundUrls.length == 0){
        return Promise.resolve('')
    }

    console.log('About to scrape')
    console.log(foundUrls)

    //Process each URL passed to function, a DB entry will be created for each scrape
    const scrapes = foundUrls.map(url => {
        return Attachment.processUrl(userId, noteId, url)
            .catch(error => {
                console.log('Site Scrape error', error)
                return ''
            })
    })

    return Promise.all(scrapes).then(scrapedTexts => {
        console.log('All urls scraped')
        return scrapedTexts.join('')
    })
}

/**
 * Download an image and store it, resized to 550px width, as `thumb_<name>` in the
 * static directory.
 * @param {string} url - image URL, empty values are allowed
 * @returns {Promise<string|null>} generated file name (without the `thumb_` prefix),
 *     or null when there is no URL or the download / resize failed.
 */
Attachment.downloadFileFromUrl = (url) => {
    return new Promise((resolve, reject) => {

        if(!url){
            return resolve(null)
        }

        const random = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15)
        const fileName = random+'_scrape'
        const thumbPath = filePath+'thumb_'+fileName

        console.log('Scraping image url', url)

        const output = fs.createWriteStream(thumbPath)
        let failed = false

        // Any failure ends here: close the stream, remove the partial file, resolve with null
        const fail = (error) => {
            if(failed){
                return
            }
            failed = true
            console.log(error)
            output.destroy()
            fs.unlink(thumbPath, () => resolve(null))
        }

        output.on('error', fail)
        output.on('close', () => {

            if(failed){
                return
            }

            //resize image if its real big
            gm(thumbPath)
                .resize(550) //Resize to width of 550 px
                .quality(85) //compression level 0 - 100 (best)
                .write(thumbPath, (err) => {
                    if(err){
                        // Not an image, or it could not be converted
                        return fail(err)
                    }
                    console.log('Saved Image')
                    return resolve(fileName)
                })
        })

        try {
            request({ uri: url, timeout: downloadTimeout })
                .on('error', fail)
                .on('response', res => {
                    console.log(res.statusCode)
                    console.log(res.headers['content-type'])
                })
                .pipe(output)
        } catch(error) {
            // request throws synchronously on some malformed URLs
            fail(error)
        }
    })
}

/**
 * Create a link attachment for a URL and fill it with scraped page data.
 * A placeholder row (text = url) is inserted first, then the page is fetched and the row
 * is updated with title, keywords and a downloaded thumbnail.
 * @param {number} userId
 * @param {number|null} noteId
 * @param {string} url
 * @returns {Promise<string>} scraped search text, 'No site text' when the scrape failed,
 *     or 'Request Timeout' when it took longer than 5 seconds. Never rejects.
 */
Attachment.processUrl = (userId, noteId, url) => {

    const scrapeTime = 5*1000

    return new Promise((resolve, reject) => {

        const options = {
            uri: url,
            simple: true,
            timeout: scrapeTime,
            headers: {
                'User-Agent':'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' //Simulate google headers
            },
            transform: function (body) {
                return cheerio.load(body)
            }
        }

        let requestTimeout = null
        let scrapeRequest = null //Named so it does not shadow the `request` module
        let insertedId = null

        //Create a shell attachment for each URL, put in processing state
        db.promise()
            .query(`INSERT INTO attachment
            (note_id, user_id, attachment_type, text, url, last_indexed, file_location)
            VALUES (?, ?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, url, url, unixNow(), null])
            .then(rows => {

                //Remember the new row, then return request for processing
                insertedId = rows[0].insertId
                scrapeRequest = rp(options)

                return scrapeRequest
            })
            .then($ => {

                //Clear timeout that would end this function
                clearTimeout(requestTimeout)

                const pageTitle = SiteScrape.getTitle($)
                const hostname = SiteScrape.getHostName(url)
                const thumbnail = SiteScrape.getDisplayImage($, url)
                const keywords = SiteScrape.getKeywords($)

                let desiredSearchText = ''
                if(pageTitle){
                    desiredSearchText += pageTitle
                }
                if(keywords){
                    desiredSearchText += "\n " + keywords
                }

                console.log('Results from site scrape-------------')
                console.log({
                    pageTitle,
                    hostname,
                    thumbnail,
                    keywords
                })

                //Scrape URL for thumbnail - take filename and save in attachment.
                //The chain is returned so a failure ends up in the catch below.
                return Attachment.downloadFileFromUrl(thumbnail)
                    .then(thumbnailFilename => {

                        //Update text and thumbnail filename
                        return db.promise()
                            .query(`UPDATE attachment SET
                                    text = ?,
                                    last_indexed = ?,
                                    file_location = ?
                                WHERE id = ?
                            `, [desiredSearchText, unixNow(), thumbnailFilename, insertedId])
                    })
                    .then(() => resolve(desiredSearchText)) //Return found text
            })
            .catch(error => {
                console.log('Scrape pooped out')
                console.log('Issue with scrape', error ? error.statusCode : error)
                clearTimeout(requestTimeout)
                return resolve('No site text')
            })

        requestTimeout = setTimeout(() => {
            console.log('Cancel the request, its taking to long.')
            // The request only exists once the insert is done, the timer may fire before that
            if(scrapeRequest && typeof scrapeRequest.cancel == 'function'){
                scrapeRequest.cancel()
            }
            return resolve('Request Timeout')
        }, scrapeTime)
    })
}

/**
 * Get the push key of a user, creating and saving a new one when none is set.
 * @param {number} userId
 * @returns {Promise<string>} the push key. Rejects when the user does not exist
 *     or a query fails.
 */
Attachment.generatePushKey = (userId) => {
    return db.promise()
        .query("SELECT pushkey FROM user WHERE id = ? LIMIT 1", [userId])
        .then(rows => {

            if(rows[0].length == 0){
                throw new Error('Can not generate push key, user not found')
            }

            const pushKey = rows[0][0].pushkey

            // push key exists
            if(pushKey && pushKey.length > 0){
                return pushKey
            }

            // generate and save a new key
            const newPushKey = cs.createSmallSalt()
            return db.promise()
                .query('UPDATE user SET pushkey = ? WHERE id = ? LIMIT 1', [newPushKey, userId])
                .then(() => newPushKey)
        })
        .catch(logAndRethrow)
}

/**
 * Remove the push key of a user.
 * @returns {Promise<boolean>} true when exactly one user row was changed.
 *     Rejects when the query fails.
 */
Attachment.deletePushKey = (userId) => {
    return db.promise()
        .query('UPDATE user SET pushkey = null WHERE id = ? LIMIT 1', [userId])
        .then(rows => rows[0].affectedRows == 1)
        .catch(logAndRethrow)
}

/**
 * Bookmarklet source for a user, their push key is created when missing.
 * @returns {Promise<string>} bookmarklet text. Rejects when the key can not be loaded.
 */
Attachment.getPushkeyBookmarklet = (userId) => {
    return Attachment.generatePushKey(userId)
        .then(pushKey => Attachment.generateBookmarkletText(pushKey))
}

/**
 * Save a URL pushed by bookmarklet as an attachment without a note.
 * @param {string} pushkey - key identifying the user
 * @param {string} url
 * @returns {Promise<boolean>} always true, also for a missing or unknown key, so the
 *     response does not reveal which keys exist. Rejects when the user lookup fails.
 */
Attachment.pushUrl = (pushkey,url) => {

    // Nothing to look up or nothing to save
    if(typeof pushkey != 'string' || pushkey.length == 0 || !url){
        return Promise.resolve(true)
    }

    // Spaces in the incoming key are turned back into '+' before the lookup
    const cleanKey = pushkey.replace(/ /g, '+')

    return db.promise()
        .query("SELECT id FROM user WHERE pushkey = ? LIMIT 1", [cleanKey])
        .then(rows => {

            // Unknown key, done. No scrape and no socket events.
            if(rows[0].length == 0){
                return true
            }

            const userId = rows[0][0].id

            return Attachment.scrapeUrlsCreateAttachments(userId, null, [url])
                .then(() => {
                    //Once everything is done being scraped, emit new attachment events
                    notifyAttachmentsChanged(userId)
                    return true
                })
        })
        .catch(logAndRethrow)
}

/**
 * Build the single line `javascript:` bookmarklet that posts the current page URL
 * to the push endpoint.
 * @param {string} pushKey
 * @returns {string} bookmarklet source without tabs and line breaks
 */
Attachment.generateBookmarkletText = (pushKey) => {

    const endpoint = '/api/public/pushmebaby'
    let url = 'https://www.solidscribe.com' + endpoint

    url += '?pushkey='+encodeURIComponent(pushKey)

    // Terminate each line with a semi-colon, super important, since line breaks are removed.
    const bookmarkletV3 = `
        javascript: (() => {
            var p = encodeURIComponent(window.location.href);
            var n = "`+url+`&url="+p;
            window.open(n, '_blank', 'noopener=noopener');
            window.focus();
            var k = document.createElement("div");
            k.setAttribute("style", "position:fixed;right:10px;top:10px;z-index:222222;border-radius:4px;font-size:1.3em;padding:20px 15px;background: #8f51be;color:white;");
            k.innerHTML = "Posted URL to your Solid Scribe account";
            document.body.appendChild(k);
            setTimeout(()=>{
                k.remove();
            },5000);
        })();
    `

    return bookmarkletV3
        .replace(/\t|\r|\n/gm, "") // Remove tabs, new lines, returns
        .replace(/\s+/g, ' ') // remove double spaces
        .trim()
}