dd0205a3c1
Added request timeout to prevent long requests from holding up note saving. Added a User-Agent header to the request to simulate the Google crawler.
230 lines
6.9 KiB
JavaScript
let db = require('@config/database')
const cheerio = require('cheerio');
const rp = require('request-promise');

let Attachment = module.exports = {}
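//Look up all previously scraped URL attachments (attachment_type = 1) for a user's note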
Attachment.forNote = (userId, noteId) => {
  return new Promise((resolve, reject) => {
    db.promise()
    .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1;`, [userId, noteId])
    .then((rows, fields) => {
      resolve(rows[0]) //Return all attachments found by query
    })
    .catch(console.log)
  })
}
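//Remove a single attachment row by id. Used to prune attachments whose URL no longer appears in the note text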
Attachment.delete = (attachmentId) => {
  return new Promise((resolve, reject) => {
    db.promise()
    .query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
    .then((rows, fields) => {
      resolve(rows[0]) //Return result of delete query
    })
    .catch(console.log)
  })
}
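//Scan note text for URLs, reuse text from already-scraped attachments, delete
//attachments for URLs that were removed, scrape any new URLs, and resolve with
//the combined searchable text for the note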
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
  return new Promise((resolve, reject) => {

    let solrAttachmentText = '' //Final searchable scrape text for note

    if(noteText.length == 0){ return resolve(solrAttachmentText) }

    Attachment.forNote(userId, noteId).then(attachments => {

      //Find all URLs in text
      const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
      let allUrls = noteText.match(urlPattern) //null when no URLs are found

      //Remove all duplicates
      let foundUrls = allUrls == null ? [] : [...new Set(allUrls)]

      //Go through each attachment, check for existing URLs
      attachments.forEach(attachment => {

        let urlIndex = foundUrls.indexOf( attachment.url )

        if(urlIndex != -1){
          //URL already scraped, push its text and continue
          solrAttachmentText += attachment.text
          foundUrls.splice(urlIndex, 1) //Remove existing URL from set of found
        } else {
          //URL no longer appears in the note, delete the stale attachment
          Attachment.delete(attachment.id)
        }
      })

      //No newly found URLs, resolve with looked up attachment text
      if(foundUrls.length == 0){
        return resolve(solrAttachmentText)
      }

      //Process the remaining URLs into attachments
      Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then( freshlyScrapedText => {
        solrAttachmentText += freshlyScrapedText
        resolve(solrAttachmentText)
      })
    })
  })
}
//Scrape each URL, creating a DB entry per scrape, and resolve with the
//concatenated text scraped from every URL
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
  return new Promise((resolve, reject) => {

    console.log('About to scrape')
    console.log(foundUrls)

    if(foundUrls == null || foundUrls.length == 0){ return resolve('') }

    let processedCount = 0
    let scrapedText = ''

    //Process each URL passed to function, a DB entry will be created for each scrape
    foundUrls.forEach(url => {
      Attachment.processUrl(userId, noteId, url).then( freshlyScrapedText => {

        scrapedText += freshlyScrapedText
        processedCount++

        //All URLs have been scraped, return data
        if(processedCount == foundUrls.length){
          resolve(scrapedText)
        }
      })
    })
  })
}
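//Scrape a single URL with a Googlebot User-Agent, distill the page into a short
//block of searchable text (title, h1 text, most frequent content words), and
//store it as an attachment row. Always resolves; a failed scrape resolves ''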
Attachment.processUrl = (userId, noteId, url) => {
  return new Promise((resolve, reject) => {

    //Common words and stop words excluded from the word frequency count
    const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
    'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
    'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
    'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
    'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
    'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

    var removeWhitespace = /\s+/g //Collapse runs of whitespace into a single space

    // console.log('Scraping ', url)
    const options = {
      uri: url,
      simple: true,
      timeout: 1000 * 10, // 10 seconds
      headers: {
        'User-Agent':'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' //Simulate Googlebot headers
      },
      transform: function (body) {
        return cheerio.load(body); //Parse the response body with cheerio
      }
    }

    let requestTimeout = null
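    //Two timeouts are in play: the request-promise `timeout` option above (10s)
    //covers socket-level stalls, while the manual 5 second timer set below caps
    //total request time, cancels the request, and records a placeholder attachment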
    let request = rp(options)
    .then($ => {

      clearTimeout(requestTimeout) //Response arrived, stop the manual timeout

      var desiredSearchText = ''

      let pageTitle = $('title').text().replace(removeWhitespace, " ")
      desiredSearchText += pageTitle + "\n"

      let header = $('h1').text().replace(removeWhitespace, " ")
      desiredSearchText += header + "\n"

      //Pull text from elements whose class or id mentions "content"
      let majorContent = ''
      majorContent += $('[class*=content]').text()
      .replace(removeWhitespace, " ") //Collapse whitespace
      .replace(/\W\s/g, '') //Strip non-word characters followed by whitespace
      .substring(0,3000) //Limit characters
      .toLowerCase()
      majorContent += $('[id*=content]').text()
      .replace(removeWhitespace, " ") //Collapse whitespace
      .replace(/\W\s/g, '') //Strip non-word characters followed by whitespace
      .substring(0,3000) //Limit characters
      .toLowerCase()
      //Count frequency of each word in scraped text
      let frequency = {}
      majorContent.split(' ').forEach(word => {
        if(excludeWords.includes(word)){
          return //Exclude common and stop words
        }
        if(!frequency[word]){
          frequency[word] = 0
        }
        frequency[word]++
      })

      //Create a sortable array of [word, count] pairs, keeping words used more than once
      var sortable = [];
      for (var index in frequency) {
        if(frequency[index] > 1){
          sortable.push([index, frequency[index]]);
        }
      }

      //Sort by most used words in the list
      sortable.sort(function(a, b) {
        return b[1] - a[1];
      });

      //Keep the top 15 words
      let finalWords = []
      for(let i=0; i<15; i++){
        if(sortable[i] && sortable[i][0]){
          finalWords.push(sortable[i][0])
        }
      }

      desiredSearchText += finalWords.join(', ')
      console.log('Text scraped')
      console.log(desiredSearchText)
      const created = Math.round((+new Date)/1000) //Unix timestamp in seconds

      //Create attachment in DB with scrape text and provided data
      db.promise()
      .query(`INSERT INTO attachment
      (note_id, user_id, attachment_type, text, url, last_indexed)
      VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
      .then((rows, fields) => {
        resolve(desiredSearchText) //Return found text
      })
      .catch(console.log)
    })
    .catch(error => {
      clearTimeout(requestTimeout) //Stop the manual timer so a failed scrape doesn't also insert a placeholder row
      console.log('Issue with scrape')
      console.log(error)
      resolve('')
    })
    //Manually cancel the request if it runs too long, recording a placeholder
    //attachment so note saving is not held up
    requestTimeout = setTimeout( () => {
      console.log('Cancel the request, it\'s taking too long.')
      request.cancel()

      const desiredSearchText = 'Unable to Scrape URL at this time'
      const created = Math.round((+new Date)/1000)

      //Create attachment in DB with placeholder text and provided data
      db.promise()
      .query(`INSERT INTO attachment
      (note_id, user_id, attachment_type, text, url, last_indexed)
      VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
      .then((rows, fields) => {
        resolve(desiredSearchText) //Return placeholder text
      })
      .catch(console.log)

    }, 5000)
  })
}
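
/*
A minimal usage sketch (hypothetical, not part of this module): how a note-save
handler might call scanTextForWebsites to refresh attachment text before
reindexing. The Express route, session shape, and @models path are assumptions.

  const Attachment = require('@models/Attachment')

  app.put('/api/note/:id', (req, res) => {
    const userId = req.session.userId
    const noteText = req.body.text

    //Resolves with the combined attachment text; failed scrapes resolve ''
    Attachment.scanTextForWebsites(userId, req.params.id, noteText)
    .then(attachmentText => {
      //Index noteText + attachmentText (e.g. in Solr) here, then respond
      res.json({ success: true })
    })
  })
*/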