SolidScribe/server/models/Attachment.js
Max G dd0205a3c1 Added a build script that will push newly built code to avid habit
Added request timeout to prevent long requests from holding up note saving
Added header to request to try to simulate the Google crawler
2019-08-03 21:03:35 +00:00

let db = require('@config/database')
const cheerio = require('cheerio')
const rp = require('request-promise')

let Attachment = module.exports = {}
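
/*
    Attachment model: scans note text for URLs, scrapes each page with
    request-promise and cheerio, and stores a short searchable summary
    (page title, h1 text, and the most frequent content words) in the
    attachment table so notes can be searched by the pages they link to.
*/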

//Fetch all URL attachments (attachment_type = 1) for a note
Attachment.forNote = (userId, noteId) => {
    return new Promise((resolve, reject) => {
        db.promise()
        .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1;`, [userId, noteId])
        .then((rows, fields) => {
            resolve(rows[0]) //Return all attachments found by the query
        })
        .catch(console.log)
    })
}

//Delete a single attachment by id
Attachment.delete = (attachmentId) => {
    return new Promise((resolve, reject) => {
        db.promise()
        .query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
        .then((rows, fields) => {
            resolve(rows[0]) //Return the result of the delete
        })
        .catch(console.log)
    })
}

//Scan note text for URLs, scrape any new ones, and resolve with searchable text for all of them
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
    return new Promise((resolve, reject) => {
        let solrAttachmentText = '' //Final searchable scrape text for note

        if(noteText.length == 0){ return resolve(solrAttachmentText) }

        Attachment.forNote(userId, noteId).then(attachments => {
            //Find all URLs in text
            const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
            let allUrls = noteText.match(urlPattern)

            //Remove all duplicates
            let foundUrls = [...new Set(allUrls)]

            //Go through each existing attachment and check it against the URLs in the text
            attachments.forEach(attachment => {
                let urlIndex = foundUrls.indexOf( attachment.url )
                if(urlIndex != -1){
                    //URL already scraped, push its text and remove it from the set of found URLs
                    solrAttachmentText += attachment.text
                    foundUrls.splice(urlIndex, 1)
                } else {
                    //URL no longer appears in the note, delete the stale attachment
                    Attachment.delete(attachment.id)
                }
            })

            //No newly found URLs, resolve with the looked up attachment text
            if(foundUrls == null || foundUrls.length == 0){
                return resolve(solrAttachmentText)
            }

            //Process the remaining URLs into attachments
            Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then( freshlyScrapedText => {
                solrAttachmentText += freshlyScrapedText
                resolve(solrAttachmentText)
            })
        })
    })
}

//Scrape each URL and resolve with the combined text; a DB entry is created for each scrape
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
    return new Promise((resolve, reject) => {
        console.log('About to scrape')
        console.log(foundUrls)

        if(foundUrls == null || foundUrls.length == 0){ return resolve('') }

        let processedCount = 0
        let scrapedText = ''

        //Process each URL passed to the function
        foundUrls.forEach(url => {
            Attachment.processUrl(userId, noteId, url).then( freshlyScrapedText => {
                scrapedText += freshlyScrapedText
                processedCount++

                //All URLs have been scraped, return data
                if(processedCount == foundUrls.length){
                    resolve(scrapedText)
                }
            })
        })
    })
}
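
/*
    Design note: the processedCount counter above resolves the promise once every URL has
    reported back, which works because processUrl always resolves and never rejects.
    A roughly equivalent sketch using Promise.all (an alternative, not the code this
    project ships) could look like:

    Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
        if(foundUrls == null || foundUrls.length == 0){ return Promise.resolve('') }
        return Promise.all(foundUrls.map(url => Attachment.processUrl(userId, noteId, url)))
        .then(scrapedTexts => scrapedTexts.join(''))
    }
*/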

//Scrape a single URL, extract searchable text, and save it as an attachment
Attachment.processUrl = (userId, noteId, url) => {
    return new Promise((resolve, reject) => {
        //Common words excluded from the word frequency summary
        const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
            'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
            'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
            'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
            'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
            'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

        var removeWhitespace = /\s+/g
        // console.log('Scraping ', url)

        const options = {
            uri: url,
            simple: true,
            timeout: 1000 * 10, // 10 second request timeout
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' //Simulate Googlebot headers
            },
            transform: function (body) {
                return cheerio.load(body)
            }
        }

        let requestTimeout = null

        let request = rp(options)
        .then($ => {
            clearTimeout(requestTimeout)

            var desiredSearchText = ''

            //Page title and main header carry the most searchable weight
            let pageTitle = $('title').text().replace(removeWhitespace, " ")
            desiredSearchText += pageTitle + "\n"

            let header = $('h1').text().replace(removeWhitespace, " ")
            desiredSearchText += header + "\n"

            //Pull text from elements whose class or id looks like page content
            let majorContent = ''
            majorContent += $('[class*=content]').text()
                .replace(removeWhitespace, " ") //Collapse runs of whitespace
                .replace(/\W\s/g, '') //Strip non-word characters followed by whitespace
                .substring(0, 3000) //Limit characters
                .toLowerCase()
            majorContent += $('[id*=content]').text()
                .replace(removeWhitespace, " ") //Collapse runs of whitespace
                .replace(/\W\s/g, '') //Strip non-word characters followed by whitespace
                .substring(0, 3000) //Limit characters
                .toLowerCase()

            //Count frequency of each word in scraped text
            let frequency = {}
            majorContent.split(' ').forEach(word => {
                if(excludeWords.includes(word)){
                    return //Skip excluded words
                }
                if(!frequency[word]){
                    frequency[word] = 0
                }
                frequency[word]++
            })

            //Create a sortable array of [word, count] pairs
            var sortable = []
            for (var index in frequency) {
                if(frequency[index] > 1){
                    sortable.push([index, frequency[index]])
                }
            }

            //Sort by most used words in the list
            sortable.sort(function(a, b) {
                return b[1] - a[1]
            })

            //Keep the top 15 most frequent words
            let finalWords = []
            for(let i = 0; i < 15; i++){
                if(sortable[i] && sortable[i][0]){
                    finalWords.push(sortable[i][0])
                }
            }
            desiredSearchText += finalWords.join(', ')

            console.log('Text scraped')
            console.log(desiredSearchText)

            const created = Math.round((+new Date)/1000)

            //Create attachment in DB with scrape text and provided data
            db.promise()
            .query(`INSERT INTO attachment
                (note_id, user_id, attachment_type, text, url, last_indexed)
                VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
            .then((rows, fields) => {
                resolve(desiredSearchText) //Return found text
            })
            .catch(console.log)
        })
        .catch(error => {
            console.log('Issue with scrape')
            console.log(error)
            resolve('')
        })

        //If the scrape takes longer than 5 seconds, cancel the request and save a placeholder
        requestTimeout = setTimeout( () => {
            console.log('Canceling the request, it is taking too long.')
            request.cancel()

            const desiredSearchText = 'Unable to Scrape URL at this time'
            const created = Math.round((+new Date)/1000)

            //Create attachment in DB with placeholder text and provided data
            db.promise()
            .query(`INSERT INTO attachment
                (note_id, user_id, attachment_type, text, url, last_indexed)
                VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
            .then((rows, fields) => {
                resolve(desiredSearchText) //Return placeholder text
            })
            .catch(console.log)
        }, (5000))
    })
}
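
/*
    Usage sketch (hypothetical, not part of this module): how a note save handler might
    call scanTextForWebsites to keep URL attachments and search text in sync. The
    onNoteSave function and the '@models/Attachment' require path are assumptions for
    illustration only.

    const Attachment = require('@models/Attachment')

    function onNoteSave(userId, noteId, noteText){
        return Attachment.scanTextForWebsites(userId, noteId, noteText)
        .then(attachmentText => {
            //attachmentText is the combined scraped text for every URL in the note,
            //suitable for indexing alongside the note body in search (e.g. Solr).
            console.log('Searchable attachment text:', attachmentText)
        })
    }
*/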