2019-07-24 11:06:50 -07:00
|
|
|
let db = require('@config/database')
|
|
|
|
|
|
|
|
//Attachment model - every exported function is attached to this object.
//Declared const: the binding is never reassigned, only extended.
const Attachment = module.exports = {}
|
|
|
|
|
2019-12-19 21:50:50 -08:00
|
|
|
const cheerio = require('cheerio')
|
|
|
|
const rp = require('request-promise')
|
|
|
|
const request = require('request')
|
|
|
|
const fs = require('fs')
|
2019-07-24 11:06:50 -07:00
|
|
|
|
2020-02-01 14:21:22 -08:00
|
|
|
const gm = require('gm')
|
|
|
|
|
|
|
|
const tesseract = require("node-tesseract-ocr")
|
|
|
|
//Relative path on disk where uploaded files, scraped images and thumbnails are stored
const filePath = '../staticFiles/'
|
|
|
|
|
|
|
|
// Attachment.migrateOld
|
|
|
|
|
|
|
|
//Full-text search over a user's attachments.
//Resolves with matching rows; each row carries a `snippet` column holding a
//window of text around the first occurrence of the search term.
//Rejects on database error (previously the promise was left pending forever).
Attachment.textSearch = (userId, searchTerm) => {
  return new Promise((resolve, reject) => {

    const front = 5   //Characters kept before the matched term in the snippet
    const tail = 150  //Characters kept after the matched term in the snippet

    //front/tail are local constants, not user input, so interpolation is safe;
    //all user-supplied values go through placeholders.
    const query = `
      SELECT
        *,
        substring(
          text,
          IF(LOCATE(?, text) > ${tail}, LOCATE(?, text) - ${front}, 1),
          ${tail} + LENGTH(?) + ${front}
        ) as snippet
      FROM attachment
      WHERE user_id = ?
      AND MATCH(text)
      AGAINST(? IN NATURAL LANGUAGE MODE)
      LIMIT 1000`

    db.promise()
      .query(query, [searchTerm, searchTerm, searchTerm, userId, searchTerm])
      .then(([rows]) => {
        resolve(rows) //Return all attachments found by query
      })
      .catch(error => {
        console.log(error)
        reject(error) //Surface the failure instead of hanging the caller
      })
  })
}
|
|
|
|
|
2020-02-10 09:44:43 -08:00
|
|
|
//Search a user's visible attachments, optionally narrowed to one note
//and/or one attachment type. Resolves with the matching rows, newest
//indexed first. Rejects on database error (previously hung forever).
Attachment.search = (userId, noteId, attachmentType) => {
  return new Promise((resolve, reject) => {

    const params = [userId]
    let query = 'SELECT * FROM attachment WHERE user_id = ? AND visible = 1 '

    //Narrow to a single note when a positive noteId is provided
    if(noteId && noteId > 0){
      query += 'AND note_id = ? '
      params.push(noteId)
    }

    //Narrow to one attachment type; Number.isInteger also admits 0
    if(Number.isInteger(attachmentType)){
      query += 'AND attachment_type = ? '
      params.push(attachmentType)
    }

    query += 'ORDER BY last_indexed DESC '

    db.promise()
      .query(query, params)
      .then(([rows]) => {
        resolve(rows) //Return all attachments found by query
      })
      .catch(error => {
        console.log(error)
        reject(error) //Surface the failure instead of hanging the caller
      })
  })
}
|
|
|
|
|
|
|
|
//Returns all visible attachments for one note, newest indexed first.
//Rejects on database error (previously the promise was left pending forever).
Attachment.forNote = (userId, noteId) => {
  return new Promise((resolve, reject) => {
    db.promise()
      .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND visible = 1 ORDER BY last_indexed DESC;`, [userId, noteId])
      .then(([rows]) => {
        resolve(rows) //Return all attachments found by query
      })
      .catch(error => {
        console.log(error)
        reject(error)
      })
  })
}
|
|
|
|
|
|
|
|
//Returns all URL attachments (attachment_type = 1) for one note, newest
//indexed first - includes hidden ones, used to diff scraped URLs.
//Rejects on database error (previously the promise was left pending forever).
Attachment.urlForNote = (userId, noteId) => {
  return new Promise((resolve, reject) => {
    db.promise()
      .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1 ORDER BY last_indexed DESC;`, [userId, noteId])
      .then(([rows]) => {
        resolve(rows) //Return all attachments found by query
      })
      .catch(error => {
        console.log(error)
        reject(error)
      })
  })
}
|
|
|
|
|
|
|
|
//Update attachment text in database. Resolves true once the UPDATE has run.
//noteId is accepted for interface compatibility but not used by the query.
//Rejects on database error (previously the promise was left pending forever).
Attachment.update = (userId, attachmentId, updatedText, noteId) => {
  return new Promise((resolve, reject) => {
    db.promise()
      .query(`UPDATE attachment SET text = ? WHERE id = ? AND user_id = ?`,
        [updatedText, attachmentId, userId])
      .then(() => {
        resolve(true)
      })
      .catch(error => {
        console.log(error)
        reject(error)
      })
  })
}
|
|
|
|
|
2020-02-01 14:21:22 -08:00
|
|
|
//Delete an attachment: removes its file and thumbnail from disk, then either
//hides it (URL attachments, so they can be restored while still in the note
//text) or deletes the row outright. Pass urlDelete = true to force real
//deletion of a URL attachment.
//Resolves true after the database write completes (previously it resolved
//before the UPDATE/DELETE had finished).
Attachment.delete = (userId, attachmentId, urlDelete = false) => {
  return new Promise((resolve, reject) => {
    db.promise()
      .query('SELECT * FROM attachment WHERE id = ? AND user_id = ? LIMIT 1', [attachmentId, userId])
      .then(([rows]) => {

        //Attachment doesn't exist, return done
        if(rows.length == 0){
          return resolve(true)
        }

        const row = rows[0]

        //Try to delete file and thumbnail - best-effort, files may be missing
        try {
          fs.unlinkSync(filePath+row.file_location)
        } catch(err) { console.error('File Does not exist') }
        try {
          fs.unlinkSync(filePath+'thumb_'+row.file_location)
        } catch(err) { console.error('Thumbnail Does not exist') }

        //Do not delete link attachments, just hide them. They will be deleted if removed from note
        if(row.attachment_type == 1 && !urlDelete){
          db.promise()
            .query(`UPDATE attachment SET visible = 0 WHERE id = ?`, [attachmentId])
            .then(() => resolve(true))
            .catch(error => {
              console.log(error)
              reject(error)
            })
          return
        }

        db.promise()
          .query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
          .then(() => resolve(true))
          .catch(error => {
            console.log(error)
            reject(error)
          })
      })
      .catch(error => {
        console.log(error)
        reject(error)
      })
  })
}
|
|
|
|
|
2019-12-19 21:50:50 -08:00
|
|
|
//Process a multer-style uploaded file: rename the random temp name to one
//carrying the original extension, insert an attachment row (type 2 = file),
//generate a thumbnail, and OCR the file for searchable text.
//Resolves { fileName, goodFileName } in every outcome - previously the
//promise hung forever when OCR failed or the INSERT errored, and a dead
//`if(true)` wrapped the OCR branch.
Attachment.processUploadedFile = (userId, noteId, fileObject) => {
  return new Promise((resolve, reject) => {

    const rawFilename = fileObject.filename                          //Random name multer stored the upload under
    const extension = '.'+fileObject.originalname.split('.').pop()   //assumes originalname has an extension - TODO confirm upstream validation
    const goodFileName = rawFilename+extension
    const fileName = fileObject.originalname //Actual name of the file, dog.jpg

    //Rename random file name to one with an extension
    fs.rename(filePath+rawFilename, filePath+goodFileName, (err) => {

      //Log (previously swallowed) but continue - the row should still exist
      if(err){ console.log(err) }

      const created = Math.round((+new Date)/1000)

      db.promise()
        .query(`
          INSERT INTO attachment
          (note_id, user_id, attachment_type, \`text\`, last_indexed, file_location)
          VALUES
          (?, ?, ?, ?, ?, ?)
        `, [noteId, userId, 2, 'Add a description to -> '+fileName, created, goodFileName])
        .then(([result]) => {

          //Best-effort thumbnail; not awaited, same as before
          Attachment.generateThumbnail(goodFileName)

          //Scrape text out of the upload with OCR
          // https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality
          //psm 3 - default, 11 - as much text as possible
          const config = { lang: "eng", oem: 1, psm: 3 }

          tesseract.recognize(filePath+goodFileName, config)
            .then(text => {

              text = text.slice(0, -1).trim()

              //Only keep OCR output long enough to be meaningful
              if(text.length > 5){
                console.log('Inserting text')
                db.promise().query(
                  `UPDATE attachment SET text = ? WHERE id = ? AND user_id = ? LIMIT 1`,
                  [text, result.insertId, userId]
                ).then(() => {
                  resolve({ fileName, goodFileName })
                })
              } else {
                return resolve({ fileName, goodFileName })
              }
            })
            .catch(error => {
              //OCR failed (non-image upload, tesseract missing, ...) -
              //still resolve so the upload request can complete
              console.log(error.message)
              resolve({ fileName, goodFileName })
            })
        })
        .catch(error => {
          console.log(error)
          reject(error) //INSERT failed - surface it instead of hanging
        })
    })
  })
}
|
|
|
|
|
|
|
|
//Generate a thumb_<fileName> preview next to the original file.
//Best-effort: resolves with fileName whether or not gm succeeds, but the
//error is now logged instead of being silently discarded.
Attachment.generateThumbnail = (fileName) => {
  return new Promise((resolve, reject) => {
    gm(filePath+fileName)
      .resize(550) //Resize to width of 550 px
      .quality(75) //compression level 0 - 100 (best)
      .write(filePath + 'thumb_'+fileName, function (err) {
        if(err){ console.log(err) } //Previously swallowed
        resolve(fileName)
      })
  })
}
|
|
|
|
|
|
|
|
//Scans note text for websites, diffs them against the note's saved URL
//attachments, deletes attachments whose URL is gone, scrapes new URLs, and
//resolves with the combined searchable text for the note.
//Fix: the empty-text early exit now returns, so no DB work runs after it.
Attachment.scanTextForWebsites = (io, userId, noteId, noteText) => {
  return new Promise((resolve, reject) => {

    let solrAttachmentText = '' //Final searchable scrape text for note

    //Nothing to scan - bail out before touching the database
    if(noteText.length == 0){ return resolve(solrAttachmentText) }

    Attachment.urlForNote(userId, noteId).then(attachments => {

      //Find all URLs in text
      //@TODO - Use the process text library for this function
      const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm

      let allUrls = noteText.match(urlPattern)
      if(allUrls == null){
        allUrls = []
      }

      //Every URL needs HTTPS!!!
      allUrls = allUrls.map(item => {
        //Bare URL - prepend https
        if(item.indexOf('https://') == -1 && item.indexOf('http://') == -1){
          return 'https://'+item
        }
        //Plain http - upgrade first occurrence to https
        if(item.indexOf('http://') >= 0){
          return item.replace('http://','https://')
        }
        return item
      })

      //Remove all duplicates
      let foundUrls = [...new Set(allUrls)]

      //Go through each saved URL, remove new URLs from saved URLs.
      //If a saved URL is no longer in the note, delete its attachment.
      attachments.forEach(attachment => {

        let urlIndex = foundUrls.indexOf( attachment.url )

        if(urlIndex != -1){
          //URL already scraped - reuse its text and drop it from the new set
          solrAttachmentText += attachment.text
          foundUrls.splice(urlIndex, 1)
        } else {
          //If existing attachment is not found in note, remove it
          Attachment.delete(userId, attachment.id, true)
        }
      })

      //No newly scraped URLs, resolve with looked up attachment text
      if(foundUrls.length == 0){
        return resolve(solrAttachmentText)
      }

      //Process the remaining URLs into attachments
      Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then( freshlyScrapedText => {

        //Once everything is done being scraped, emit new attachment events
        if(io){
          io.to(userId).emit('update_counts')
        }

        solrAttachmentText += freshlyScrapedText
        resolve(solrAttachmentText)
      })
    })
  })
}
|
|
|
|
|
|
|
|
//Return scraped text from each URL. A DB row is created per URL by
//Attachment.processUrl, which always resolves (errors resolve to ''), so
//Promise.all cannot reject here. Joining in input order replaces the old
//manual completion counter, whose concatenation order depended on which
//scrape finished first.
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
  return new Promise((resolve, reject) => {

    if(foundUrls == null || foundUrls.length == 0){
      return resolve('')
    }

    console.log('About to scrape')
    console.log(foundUrls)

    //Scrape all URLs in parallel and combine their text deterministically
    Promise.all(
      foundUrls.map(url => Attachment.processUrl(userId, noteId, url))
    )
    .then(scrapedTexts => {
      resolve(scrapedTexts.join(''))
    })
  })
}
|
|
|
|
|
2019-12-19 21:50:50 -08:00
|
|
|
//Download an image URL to disk (saved as thumb_<random>_scrape.<ext>),
//resize it in place, and resolve with the base filename (without the
//thumb_ prefix - callers store that and re-add thumb_ when reading).
//Resolves null when url is null or the request errors.
//Fix: the null-url guard now returns instead of falling through to
//url.split on null.
Attachment.downloadFileFromUrl = (url) => {
  return new Promise((resolve, reject) => {

    if(url == null){
      return resolve(null)
    }

    //Random collision-resistant name; note extension is taken from the URL
    //tail, so query-string URLs can yield odd extensions - TODO confirm
    const random = Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15)
    const extension = '.'+url.split('.').pop()
    const fileName = random+'_scrape'+extension
    const thumbPath = 'thumb_'+fileName

    console.log('Scraping image url')
    console.log(url)

    console.log('Getting ready to scrape ', url)

    request(url)
      .on('error', error => {
        console.log(error)
        resolve(null)
      })
      .on('response', res => {
        console.log(res.statusCode)
        console.log(res.headers['content-type'])
      })
      .pipe(fs.createWriteStream(filePath+thumbPath))
      .on('close', () => {

        //resize image if its real big
        gm(filePath+thumbPath)
          .resize(550) //Resize to width of 550 px
          .quality(75) //compression level 0 - 100 (best)
          .write(filePath+thumbPath, function (err) {
            if(err){ console.log(err) }
          })

        console.log('Saved Image')
        resolve(fileName)
      })
  })
}
|
|
|
|
|
2019-07-24 11:06:50 -07:00
|
|
|
//Scrape one URL into an attachment row: insert a 'Processing...' shell row,
//fetch the page with a Googlebot user agent, extract title + top keywords,
//download the og:image thumbnail, then update the row. A 20s timeout cancels
//slow scrapes and writes a placeholder description instead.
//Always resolves (with the scraped text, the placeholder, or '') - callers
//aggregate results and must not see rejections.
//Fixes: timeout-path UPDATE had a trailing comma before WHERE (invalid SQL);
//desiredSearchText was assigned in the timeout callback while declared in the
//scrape callback, creating an implicit global; request.cancel() could throw
//if the timeout fired before the shell INSERT finished; the timeout was not
//cleared when the scrape failed.
Attachment.processUrl = (userId, noteId, url) => {

  //How long a scrape may run before it is cancelled
  const scrapeTime = 20*1000;

  return new Promise((resolve, reject) => {

    //Stop-words excluded from the keyword frequency count
    const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
    'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
    'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
    'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
    'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
    'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

    const removeWhitespace = /\s+/g

    const options = {
      uri: url,
      simple: true,
      timeout: scrapeTime,
      headers: {
        'User-Agent':'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' //Simulate google headers
      },
      transform: function (body) {
        return cheerio.load(body);
      }
    }

    let requestTimeout = null
    let thumbnail = null
    let scrapeRequest = null          //Renamed from `request` - no longer shadows the required module
    let created = Math.round((+new Date)/1000)
    let insertedId = null
    let desiredSearchText = ''        //Shared by the success and timeout paths

    //Create a shell attachment for each URL, put in processing state
    db.promise()
      .query(`INSERT INTO attachment
        (note_id, user_id, attachment_type, text, url, last_indexed, file_location)
        VALUES (?, ?, ?, ?, ?, ?, ?)`,
        [noteId, userId, 1, 'Processing...', url, created, null])
      .then(([result]) => {
        //Kick off the page fetch and remember the shell row id
        scrapeRequest = rp(options)
        insertedId = result.insertId
        return scrapeRequest
      })
      .then($ => {

        clearTimeout(requestTimeout)

        //Page title becomes the first line of the searchable text
        let pageTitle = $('title').text().replace(removeWhitespace, " ")
        desiredSearchText += pageTitle + "\n"

        //Scrape metadata for page image
        let metadata = $('meta[property="og:image"]')
        if(metadata && metadata[0] && metadata[0].attribs){
          thumbnail = metadata[0].attribs.content
        }

        //Pull text from elements whose class/id mentions "content"
        let majorContent = ''
        majorContent += $('[class*=content]').text()
          .replace(removeWhitespace, " ") //Remove all whitespace
          .replace(/\W\s/g, '') //Remove all non alphanumeric characters
          .substring(0,3000)
          .toLowerCase()
        majorContent += $('[id*=content]').text().replace(removeWhitespace, " ")
          .replace(removeWhitespace, " ") //Remove all whitespace
          .replace(/\W\s/g, '') //Remove all non alphanumeric characters
          .substring(0,3000) //Limit characters
          .toLowerCase()

        //Count frequency of each word in scraped text
        let frequency = {}
        majorContent.split(' ').forEach(word => {
          if(excludeWords.includes(word)){
            return //Exclude certain words
          }
          if(!frequency[word]){
            frequency[word] = 0
          }
          frequency[word]++
        })

        //Keep only words used more than once, sorted by frequency
        let sortable = []
        for (const word in frequency) {
          if(frequency[word] > 1){
            sortable.push([word, frequency[word]])
          }
        }
        sortable.sort((a, b) => b[1] - a[1])

        //Top five words become the keyword line
        let finalWords = []
        for(let i=0; i<5; i++){
          if(sortable[i] && sortable[i][0]){
            finalWords.push(sortable[i][0])
          }
        }
        if(finalWords.length > 0){
          desiredSearchText += 'Keywords: ' + finalWords.join(', ')
        }

        created = Math.round((+new Date)/1000)

        //Scrape URL for thumbnail - take filename and save in attachment
        Attachment.downloadFileFromUrl(thumbnail)
          .then(thumbnailFilename => {

            //Update text and thumbnail filename
            created = Math.round((+new Date)/1000)
            db.promise()
              .query(`UPDATE attachment SET
                text = ?,
                last_indexed = ?,
                file_location = ?
                WHERE id = ?
              `, [desiredSearchText, created, thumbnailFilename, insertedId])
              .then(() => {
                resolve(desiredSearchText) //Return found text
              })
              .catch(console.log)
          })
      })
      .catch(error => {
        clearTimeout(requestTimeout) //Don't run the timeout path after a failure
        console.log('Issue with scrape')
        console.log(error)
        resolve('')
      })

    //Cancel the scrape and store a placeholder if it takes too long
    requestTimeout = setTimeout( () => {
      console.log('Cancel the request, its taking to long.')

      //The fetch may not have started yet if the INSERT is still running
      if(scrapeRequest){ scrapeRequest.cancel() }

      desiredSearchText = 'No Description for -> '+url

      created = Math.round((+new Date)/1000)
      db.promise()
        .query(`UPDATE attachment SET
          text = ?,
          last_indexed = ?
          WHERE id = ?
        `, [desiredSearchText, created, insertedId])
        .then(() => {
          resolve(desiredSearchText) //Return found text
        })
        .catch(console.log)
    }, scrapeTime )
  })
}
|