SolidScribe/server/models/Attachment.js

let db = require('@config/database')
let SiteScrape = require('@helpers/SiteScrape')
const cs = require('@helpers/CryptoString')
let Attachment = module.exports = {}
const cheerio = require('cheerio')
const rp = require('request-promise')
const request = require('request')
const fs = require('fs')
const gm = require('gm')
const tesseract = require("node-tesseract-ocr")
const filePath = '../staticFiles/'
// Attachment.migrateOld
Attachment.textSearch = (userId, searchTerm) => {
return new Promise((resolve, reject) => {
const front = 5
const tail = 150
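//Snippet logic: start the excerpt a few characters before the first match (when the match sits deep enough in the text)
//and include enough trailing context to display. MATCH ... AGAINST below relies on a FULLTEXT index on attachment.text.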
const query = `
SELECT
*,
substring(
text,
IF(LOCATE(?, text) > ${tail}, LOCATE(?, text) - ${front}, 1),
${tail} + LENGTH(?) + ${front}
) as snippet
FROM attachment
WHERE user_id = ?
AND visible != 0
AND MATCH(text)
AGAINST(? IN NATURAL LANGUAGE MODE)
LIMIT 1000`
db.promise()
.query(query, [searchTerm, searchTerm, searchTerm, userId, searchTerm])
.then((rows, fields) => {
resolve(rows[0]) //Return all attachments found by query
})
.catch(console.log)
})
}
Attachment.search = (userId, noteId, attachmentType, offset, setSize, includeShared) => {
console.log([userId, noteId, attachmentType, offset, setSize, includeShared])
return new Promise((resolve, reject) => {
let params = [userId]
let query = `
SELECT attachment.*, note.share_user_id FROM attachment
LEFT JOIN note ON (attachment.note_id = note.id)
WHERE attachment.user_id = ? AND visible = 1
`
if(noteId && noteId > 0){
//
// Show everything if note ID is present
//
query += 'AND attachment.note_id = ? '
params.push(noteId)
} else {
//
// Other filters if NO note id
//
if(attachmentType == 'links'){
query += 'AND attachment_type = 1 '
}
if(attachmentType == 'files'){
query += 'AND attachment_type > 1 '
}
query += `AND note.archived = ${ attachmentType == 'archived' ? '1':'0' } `
query += `AND note.trashed = ${ attachmentType == 'trashed' ? '1':'0' } `
if(!attachmentType){
// Null note ID means it was pushed by bookmarklet
query += 'OR attachment.note_id IS NULL '
}
}
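//Outside of a single note, filter by sharing: only shared notes when includeShared is set, otherwise only unshared notes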
if(!noteId){
const sharedOrNot = includeShared ? ' NOT ':' '
query += `AND note.share_user_id IS${sharedOrNot}NULL `
}
query += 'ORDER BY last_indexed DESC '
const limitOffset = parseInt(offset, 10) || 0 //Either parse int, or use zero
const parsedSetSize = parseInt(setSize, 10) || 20
query += ` LIMIT ${limitOffset}, ${parsedSetSize}`
console.log(query)
db.promise()
.query(query, params)
.then((rows, fields) => {
resolve(rows[0]) //Return all attachments found by query
})
.catch(console.log)
})
}
Attachment.urlForNote = (userId, noteId) => {
return new Promise((resolve, reject) => {
db.promise()
.query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1 ORDER BY last_indexed DESC;`, [userId, noteId])
.then((rows, fields) => {
resolve(rows[0]) //Return all attachments found by query
})
.catch(console.log)
})
}
//Update attachment in database
Attachment.update = (userId, attachmentId, updatedText, noteId) => {
return new Promise((resolve, reject) => {
db.promise()
.query(`UPDATE attachment SET text = ? WHERE id = ? AND user_id = ?`,
[updatedText, attachmentId, userId])
.then((rows, fields) => {
resolve(true)
})
.catch(console.log)
})
}
Attachment.delete = (userId, attachmentId, urlDelete = false) => {
let attachment = null
let noteExists = true
return new Promise((resolve, reject) => {
db.promise()
.query('SELECT * FROM attachment WHERE id = ? AND user_id = ? LIMIT 1', [attachmentId, userId])
.then((rows, fields) => {
//Attachment doesn't exist, return done
if(rows[0].length == 0){
return resolve(true)
}
attachment = rows[0][0]
return db.promise().query('SELECT count(id) as `exists` FROM note WHERE id = ?', [attachment.note_id])
})
.then((rows, fields) => {
noteExists = (rows[0][0]['exists'] > 0)
let url = attachment.url
const noteId = attachment.note_id
//Try to delete file and thumbnail
try {
fs.unlinkSync(filePath+attachment.file_location)
} catch(err) { console.error('File does not exist') }
try {
fs.unlinkSync(filePath+'thumb_'+attachment.file_location)
} catch(err) { console.error('Thumbnail does not exist') }
//Do not delete link attachments, just hide them. They will be deleted if removed from note or if note is deleted
if(attachment.attachment_type == 1 && !urlDelete && noteExists){
db.promise()
.query(`UPDATE attachment SET visible = 0 WHERE id = ?`, [attachmentId])
.then((rows, fields) => resolve(true))
.catch(console.log)
} else {
db.promise()
.query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
.then((rows, fields) => resolve(true))
.catch(console.log)
}
})
.catch(console.log)
})
}
Attachment.processUploadedFile = (userId, noteId, fileObject) => {
return new Promise((resolve, reject) => {
const rawFilename = fileObject.filename
const extension = '.'+fileObject.originalname.split('.').pop()
const goodFileName = rawFilename+extension
const fileName = fileObject.originalname //Actual name of the file, dog.jpg
//Rename random file name to one with an extension
fs.rename(filePath+rawFilename, filePath+goodFileName, (err) => {
const created = Math.round((+new Date)/1000)
db.promise()
.query(`
INSERT INTO attachment
(note_id, user_id, attachment_type, \`text\`, last_indexed, file_location)
VALUES
(?, ?, ?, ?, ?, ?)
`, [noteId, userId, 2, 'Add a description to -> '+fileName, created, goodFileName])
.then((rows, fields) => {
Attachment.generateThumbnail(goodFileName)
//Attempt OCR on the uploaded file so any text in it becomes searchable
if(true){
// https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality
//psm 3 - default, 11 - as much text as possible
const config = { lang: "eng", oem: 1, psm: 3 }
tesseract.recognize(filePath+goodFileName, config)
.then(text => {
text = text.slice(0, -1).trim()
if(text.length > 5){
console.log('Inserting text')
db.promise().query(
`UPDATE attachment SET text = ? WHERE id = ? AND user_id = ? LIMIT 1`,
[text, rows[0].insertId, userId]
).then(results => {
resolve({ fileName, goodFileName })
})
} else {
return resolve({ fileName, goodFileName })
}
})
.catch(error => {
console.log(error.message)
resolve({ fileName, goodFileName }) //Resolve anyway so the upload completes even if OCR fails
})
} else {
resolve({ fileName, goodFileName })
}
})
.catch(console.log)
})
})
}
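//Write a resized, compressed copy of an image into the same directory, prefixed with thumb_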
Attachment.generateThumbnail = (fileName) => {
return new Promise((resolve, reject) => {
gm(filePath+fileName)
.resize(550) //Resize to width of 550 px
.quality(75) //compression level 0 - 100 (best)
.write(filePath + 'thumb_'+fileName, function (err) {
resolve(fileName)
})
})
}
//Scans note text for websites, returns the combined searchable text from all scraped URLs
Attachment.scanTextForWebsites = (io, userId, noteId, noteText) => {
return new Promise((resolve, reject) => {
let solrAttachmentText = '' //Final searchable scrape text for note
if(noteText.length == 0){ return resolve(solrAttachmentText) }
Attachment.urlForNote(userId, noteId).then(attachments => {
//Pull all the URLs out of the text
let foundUrls = SiteScrape.getCleanUrls(noteText)
//Go through each saved URL, remove new URLs from saved URLs
//If a URL is not found, delete it
attachments.forEach(attachment => {
//URL already scraped, push text and continue
let urlIndex = foundUrls.indexOf( attachment.url )
if(urlIndex != -1){
solrAttachmentText += attachment.text
foundUrls.splice(urlIndex, 1) //Remove existing from set of found
} else {
//If existing attachment is not found in note, remove it
Attachment.delete(userId, attachment.id, true)
}
})
//No newly scraped URLs, resolve with looked up attachment text
if(foundUrls == null || foundUrls.length == 0){
return resolve(solrAttachmentText)
}
//Process the remaining URLs into attachments
Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then( freshlyScrapedText => {
//Once everything is done being scraped, emit new attachment events
SocketIo.to(userId).emit('update_counts')
// Tell user to update attachments with scraped text
SocketIo.to(userId).emit('update_note_attachments')
solrAttachmentText += freshlyScrapedText
resolve(solrAttachmentText)
})
.catch(console.log)
})
})
}
//Return scraped text from each URL
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
return new Promise((resolve, reject) => {
if(foundUrls == null || foundUrls.length == 0){
return resolve('')
}
console.log('About to scrape')
console.log(foundUrls)
let processedCount = 0
let scrapedText = ''
//Process each URL passed to this function, a DB entry will be created for each scrape
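//Completion is tracked with a counter; the promise resolves once every URL has reported back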
foundUrls.forEach(url => {
Attachment.processUrl(userId, noteId, url).then( freshlyScrapedText => {
scrapedText += freshlyScrapedText
processedCount ++
//All URLs have been scraped, return data
if(processedCount == foundUrls.length){
console.log('All urls scraped')
return resolve(scrapedText)
}
})
.catch(error => {
console.log('Site Scrape error', error)
})
})
})
}
Attachment.downloadFileFromUrl = (url) => {
return new Promise((resolve, reject) => {
if(!url){
return resolve(null)
}
const random = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15)
let extension = ''
let fileName = random+'_scrape'
let thumbPath = 'thumb_'+fileName
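//The image is written straight to the thumb_ path; the bare fileName is what gets stored on the attachment row,
//and the delete routine re-adds the thumb_ prefix when cleaning up the file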
console.log('Scraping image url', url)
console.log('Getting ready to scrape ', url)
request(url)
.on('error', error => {
console.log(error)
resolve(null)
})
.on('response', res => {
console.log(res.statusCode)
console.log(res.headers['content-type'])
//Get mime type from header content type
// extension = '.'+String(res.headers['content-type']).split('/').pop()
})
.pipe(fs.createWriteStream(filePath+thumbPath))
.on('close', () => {
//Resize the image if it's really big
gm(filePath+thumbPath)
.resize(550) //Resize to width of 550 px
.quality(85) //compression level 0 - 100 (best)
.write(filePath+thumbPath, function (err) {
if(err){
console.log(err)
return resolve(null)
}
console.log('Saved Image')
return resolve(fileName)
})
})
})
}
Attachment.processUrl = (userId, noteId, url) => {
const scrapeTime = 5*1000;
return new Promise((resolve, reject) => {
const options = {
uri: url,
simple: true,
timeout: scrapeTime,
headers: {
'User-Agent':'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' //Simulate google headers
},
transform: function (body) {
return cheerio.load(body);
}
}
let requestTimeout = null
let thumbnail = null
let request = null
let created = Math.round((+new Date)/1000)
let insertedId = null
//Create a shell attachment for each URL, put in processing state
db.promise()
.query(`INSERT INTO attachment
(note_id, user_id, attachment_type, text, url, last_indexed, file_location)
VALUES (?, ?, ?, ?, ?, ?, ?)`,
[noteId, userId, 1, url, url, created, null])
.then((rows, fields) => {
//Set two outer-scope variables, then return the request for processing
request = rp(options)
insertedId = rows[0].insertId
return request
})
.then($ => {
//Clear timeout that would end this function
clearTimeout(requestTimeout)
// let header = $('h1').text().replace(removeWhitespace, " ")
// desiredSearchText += header + "\n"
const pageTitle = SiteScrape.getTitle($)
const hostname = SiteScrape.getHostName(url)
const thumbnail = SiteScrape.getDisplayImage($, url)
const keywords = SiteScrape.getKeywords($)
var desiredSearchText = ''
desiredSearchText += pageTitle
if(keywords){
desiredSearchText += "\n " + keywords
}
console.log('Results from site scrape-------------')
console.log({
pageTitle,
hostname,
thumbnail,
keywords
})
// throw new Error('Ending this function early.')
// console.log('TexT Scraped')
// console.log(desiredSearchText)
created = Math.round((+new Date)/1000)
//Scrape URL for thumbnail - take filename and save in attachment
Attachment.downloadFileFromUrl(thumbnail)
.then(thumbnailFilename => {
//Update text and thumbnail filename
created = Math.round((+new Date)/1000)
db.promise()
.query(`UPDATE attachment SET
text = ?,
last_indexed = ?,
file_location = ?
WHERE id = ?
`, [desiredSearchText, created, thumbnailFilename, insertedId])
.then((rows, fields) => {
resolve(desiredSearchText) //Return found text
})
.catch(console.log)
//Create attachment in DB with scrape text and provided data
// db.promise()
// .query(`INSERT INTO attachment
// (note_id, user_id, attachment_type, text, url, last_indexed, file_location)
// VALUES (?, ?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created, thumbnailFilename])
// .then((rows, fields) => {
// resolve(desiredSearchText) //Return found text
// })
// .catch(console.log)
})
})
.catch(error => {
console.log('Scrape pooped out')
console.log('Issue with scrape', error.statusCode)
clearTimeout(requestTimeout)
return resolve('No site text')
})
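//Safety net: if the scrape takes longer than scrapeTime, cancel the request and resolve with a timeout message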
requestTimeout = setTimeout( () => {
console.log('Cancel the request, it is taking too long.')
request.cancel()
return resolve('Request Timeout')
}, scrapeTime )
})
}
Attachment.generatePushKey = (userId) => {
return new Promise((resolve, reject) => {
db.promise()
.query("SELECT pushkey FROM user WHERE id = ? LIMIT 1", [userId])
.then((rows, fields) => {
const pushKey = rows[0][0].pushkey
// push key exists
if(pushKey && pushKey.length > 0){
return resolve(pushKey)
} else {
// generate and save a new key
const newPushKey = cs.createSmallSalt()
db.promise()
.query('UPDATE user SET pushkey = ? WHERE id = ? LIMIT 1', [newPushKey,userId])
.then((rows, fields) => {
return resolve(newPushKey)
})
}
})
})
}
Attachment.deletePushKey = (userId) => {
return new Promise((resolve, reject) => {
db.promise()
.query('UPDATE user SET pushkey = null WHERE id = ? LIMIT 1', [userId])
.then((rows, fields) => {
return resolve(rows[0].affectedRows == 1)
})
})
}
Attachment.getPushkeyBookmarklet = (userId) => {
return new Promise((resolve, reject) => {
Attachment.generatePushKey(userId)
.then( pushKey => {
let bookmarklet = Attachment.generateBookmarkletText(pushKey)
return resolve(bookmarklet)
})
})
}
Attachment.pushUrl = (pushkey,url) => {
return new Promise((resolve, reject) => {
let userId = null
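//'+' characters in the pushkey decode to spaces when passed through a query string, so restore them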
pushkey = pushkey.replace(/ /g, '+')
db.promise()
.query("SELECT id FROM user WHERE pushkey = ? LIMIT 1", [pushkey])
.then((rows, fields) => {
if(rows[0].length == 0){
return resolve(true)
}
userId = rows[0][0].id
return Attachment.scrapeUrlsCreateAttachments(userId, null, [url])
})
.then(() => {
if(typeof SocketIo != 'undefined'){
//Once everything is done being scraped, emit new attachment events
SocketIo.to(userId).emit('update_counts')
// Tell user to update attachments with scraped text
SocketIo.to(userId).emit('update_note_attachments')
}
return resolve(true)
})
.catch(console.log)
})
}
Attachment.generateBookmarkletText = (pushKey) => {
const endpoint = '/api/public/pushmebaby'
let url = 'https://www.solidscribe.com' + endpoint
if(process.env.NODE_ENV === 'development'){
// url = 'https://192.168.1.164' + endpoint
}
// Terminate each line with a semi-colon, super important, since spaces are removed.
// document.getElementById(id).remove();
url += '?pushkey='+encodeURIComponent(pushKey)
const bookmarkletV3 = `
javascript: (() => {
var p = encodeURIComponent(window.location.href);
var n = "`+url+`&url="+p;
window.open(n, '_blank', 'noopener=noopener');
window.focus();
var k = document.createElement("div");
k.setAttribute("style", "position:fixed;right:10px;top:10px;z-index:222222;border-radius:4px;font-size:1.3em;padding:20px 15px;background: #8f51be;color:white;");
k.innerHTML = "Posted URL to your Solid Scribe account";
document.body.appendChild(k);
setTimeout(()=>{
k.remove();
},5000);
})();
`
return bookmarkletV3
.replace(/\t|\r|\n/gm, "") // Remove tabs, new lines, returns
.replace(/\s+/g, ' ') // remove double spaces
.trim()
}