let db = require('@config/database')

const cheerio = require('cheerio')
const rp = require('request-promise')

let Attachment = module.exports = {}

//Look up all of a user's website attachments (attachment_type = 1) for a single note
Attachment.forNote = (userId, noteId) => {
	return new Promise((resolve, reject) => {
		db.promise()
		.query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1;`, [userId, noteId])
		.then(([rows, fields]) => {
			resolve(rows) //Resolve with all attachments found by the query
		})
		.catch(console.log)
	})
}
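
//Usage sketch (assumption — the express-style handler and field names are
//hypothetical, not part of this module):
//
//   Attachment.forNote(req.userId, req.body.noteId)
//       .then(attachments => res.json(attachments))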

//Remove a single attachment row by its id
Attachment.delete = (attachmentId) => {
	return new Promise((resolve, reject) => {
		db.promise()
		.query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
		.then(([result]) => {
			resolve(result) //Resolve with the result header of the DELETE
		})
		.catch(console.log)
	})
}
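
//Usage sketch (assumption — affectedRows comes from the mysql2 result header):
//
//   Attachment.delete(attachment.id).then(result => console.log(result.affectedRows))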

//Build the searchable attachment text for a note: reuse attachments whose URLs are
//still in the note, delete stale ones, then scrape any newly added URLs
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
	return new Promise((resolve, reject) => {

		let solrAttachmentText = '' //Final searchable scrape text for note

		if(noteText.length == 0){ return resolve(solrAttachmentText) }

		Attachment.forNote(userId, noteId).then(attachments => {

			//Find all URLs in text
			const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
			let allUrls = noteText.match(urlPattern)

			//Remove all duplicates (match() returns null on no matches, which Set treats as empty)
			let foundUrls = [...new Set(allUrls)]
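
			//For example (hypothetical note text): 'see https://a.com and https://a.com'
			//would give allUrls = ['https://a.com', 'https://a.com'] and foundUrls = ['https://a.com']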

			//Go through each existing attachment and check for its URL in the note
			attachments.forEach(attachment => {

				let urlIndex = foundUrls.indexOf(attachment.url)

				if(urlIndex != -1){
					//URL already scraped: reuse its text
					solrAttachmentText += attachment.text
					foundUrls.splice(urlIndex, 1) //Remove the already-scraped URL from the found set
				} else {
					//URL no longer appears in the note: remove the stale attachment
					Attachment.delete(attachment.id)
				}
			})

			//No newly found URLs to scrape: resolve with the reused attachment text
			if(foundUrls.length == 0){
				return resolve(solrAttachmentText)
			}

			//Process the remaining URLs into attachments
			Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then(freshlyScrapedText => {

				solrAttachmentText += freshlyScrapedText
				resolve(solrAttachmentText)
			})
		})
	})
}
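
//Usage sketch (assumption — the note-save flow and the Solr indexing step are
//hypothetical; this file only produces the searchable text):
//
//   Attachment.scanTextForWebsites(user.id, note.id, note.text)
//       .then(searchText => { /* index note text + searchText in Solr */ })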

//Scrape every URL and return the combined scraped text; a DB entry is created per URL
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
	return new Promise((resolve, reject) => {

		console.log('About to scrape')
		console.log(foundUrls)

		if(foundUrls == null || foundUrls.length == 0){ return resolve('') }

		let processedCount = 0
		let scrapedText = ''

		//Process each URL passed to the function
		foundUrls.forEach(url => {
			Attachment.processUrl(userId, noteId, url).then(freshlyScrapedText => {

				scrapedText += freshlyScrapedText
				processedCount++

				//All URLs have been scraped, return the combined text
				//(concatenation order depends on which scrape finishes first)
				if(processedCount == foundUrls.length){
					resolve(scrapedText)
				}
			})
		})
	})
}
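
//Equivalent sketch using Promise.all (assumption — only equivalent because processUrl
//below always resolves; a rejection would fail the whole batch):
//
//   Promise.all((foundUrls || []).map(url => Attachment.processUrl(userId, noteId, url)))
//       .then(texts => texts.join(''))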

//Scrape a single URL, save the result as an attachment row, and resolve with the text
Attachment.processUrl = (userId, noteId, url) => {
	return new Promise((resolve, reject) => {

		//Common stop words excluded from the word-frequency count
		const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
		'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
		'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
		'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
		'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
		'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

		const removeWhitespace = /\s+/g //Collapse runs of whitespace into single spaces

		// console.log('Scraping', url)

		const options = {
			uri: url,
			transform: function (body) {
				return cheerio.load(body) //Parse the response HTML with cheerio
			}
		}

		rp(options).then($ => {

			let desiredSearchText = ''

			//Grab the page title and top-level headers
			let pageTitle = $('title').text().replace(removeWhitespace, " ")
			desiredSearchText += pageTitle + "\n"

			let header = $('h1').text().replace(removeWhitespace, " ")
			desiredSearchText += header + "\n"

			//Pull text from elements whose class or id suggests main page content
			let majorContent = ''
			majorContent += $('[class*=content]').text()
			.replace(removeWhitespace, " ") //Collapse whitespace
			.replace(/[^\w\s]/g, '') //Remove all non-alphanumeric characters
			.substring(0,3000) //Limit characters
			.toLowerCase()
			majorContent += $('[id*=content]').text()
			.replace(removeWhitespace, " ") //Collapse whitespace
			.replace(/[^\w\s]/g, '') //Remove all non-alphanumeric characters
			.substring(0,3000) //Limit characters
			.toLowerCase()

			//Count the frequency of each word in the scraped text
			let frequency = {}
			majorContent.split(' ').forEach(word => {
				if(excludeWords.includes(word)){
					return //Skip stop words
				}
				if(!frequency[word]){
					frequency[word] = 0
				}
				frequency[word]++
			})
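
			//For example (hypothetical input): 'solr search solr index' would yield
			//frequency = { solr: 2, search: 1, index: 1 }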

			//Keep words that appear more than once in a sortable [word, count] array
			let sortable = []
			for (let word in frequency) {
				if(frequency[word] > 1){
					sortable.push([word, frequency[word]])
				}
			}

			//Sort by descending frequency
			sortable.sort(function(a, b) {
				return b[1] - a[1]
			})

			//Keep the 15 most frequent words
			let finalWords = []
			for(let i=0; i<15; i++){
				if(sortable[i] && sortable[i][0]){
					finalWords.push(sortable[i][0])
				}
			}

			desiredSearchText += finalWords.join(', ')

			console.log('Text scraped')
			console.log(desiredSearchText)

			const created = Math.round((+new Date)/1000) //Unix timestamp in seconds

			//Create attachment in DB with scraped text and provided data
			db.promise()
			.query(`INSERT INTO attachment
			(note_id, user_id, attachment_type, text, url, last_indexed)
			VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
			.then(() => {
				resolve(desiredSearchText) //Resolve with the scraped text
			})
			.catch(console.log)

		})
		.catch(error => {
			//A dead link or timeout would otherwise leave this promise pending forever and
			//stall the counter in scrapeUrlsCreateAttachments, so resolve with empty text
			console.log('Could not scrape', url, error.message)
			resolve('')
		})
	})
}
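
//End-to-end sketch (assumption — the ids and note text are hypothetical; in the app
//this module is driven by whatever saves notes, not run standalone):
//
//   Attachment.scanTextForWebsites(1, 42, 'Read https://example.com later')
//       .then(searchText => console.log('Searchable text:', searchText))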