fcee24a61d
Tweaked a lot of styles and added some cool animations Added a little to the help text Quickly adding a note, saving and closing no longer causes half formed or empty notes to appear Close Editor animation Display cards text show at the bottom of card Added a delete function, and it works Added browser title attributes More debugging and error checking on scraped links Updated not search to display title and text below the title
195 lines
6.0 KiB
JavaScript
195 lines
6.0 KiB
JavaScript
//Database connection pool (mysql2), configured by the application
const db = require('@config/database')

//Scraping dependencies: request-promise fetches pages, cheerio parses the HTML
const rp = require('request-promise');
const cheerio = require('cheerio');

//All attachment DB/scrape helpers hang off this exported object
const Attachment = module.exports = {}
/**
 * Fetch all website attachments (attachment_type = 1) for one note.
 *
 * @param {number} userId - Owner of the note; scopes the lookup.
 * @param {number} noteId - Note whose attachments are returned.
 * @returns {Promise<Array>} Resolves with the attachment rows (possibly empty).
 *
 * Fix: the previous version wrapped the query in `new Promise` and swallowed
 * DB errors with `.catch(console.log)`, leaving the promise pending forever.
 * mysql2's promise API resolves with a single `[rows, fields]` array, so the
 * old `(rows, fields) => rows[0]` only worked by accident; destructure instead.
 */
Attachment.forNote = (userId, noteId) => {
    return db.promise()
    .query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1;`, [userId, noteId])
    .then(([rows, fields]) => rows) //Return all attachment rows found by query
    .catch(error => {
        console.log(error)
        throw error //Surface the failure to callers instead of hanging
    })
}
|
|
|
|
/**
 * Delete a single attachment row by id.
 *
 * @param {number} attachmentId - Primary key of the attachment to remove.
 * @returns {Promise<Object>} Resolves with the mysql2 result header for the DELETE.
 *
 * Fix: same anti-pattern as forNote - explicit Promise wrapper plus a
 * `.catch(console.log)` that left the promise pending forever on error, and a
 * `(rows, fields)` callback that mis-modeled mysql2's `[result, fields]` value.
 */
Attachment.delete = (attachmentId) => {
    return db.promise()
    .query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
    .then(([result, fields]) => result) //mysql2 resolves with [result, fields]
    .catch(error => {
        console.log(error)
        throw error //Surface the failure to callers instead of hanging
    })
}
|
|
|
|
/**
 * Build the searchable "attachment text" for a note by scanning its text
 * for URLs.
 *
 * Existing attachments whose URL still appears in the note are reused;
 * attachments whose URL is gone are deleted (fire-and-forget); URLs with no
 * attachment yet are scraped and stored.
 *
 * @param {number} userId
 * @param {number} noteId
 * @param {string} noteText - Raw note text to scan for URLs.
 * @returns {Promise<string>} Concatenated searchable text for all note URLs.
 *
 * Fix: the early `resolve()` calls were missing `return`, so an empty note
 * still queried the DB after resolving, and the "no new URLs" branch fell
 * through into the scraper.
 */
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
    return new Promise((resolve, reject) => {

        let solrAttachmentText = '' //Final searchable scrape text for note

        //Nothing to scan - resolve and stop, don't fall through to the DB
        if(noteText.length == 0){ return resolve(solrAttachmentText) }

        Attachment.forNote(userId, noteId).then(attachments => {

            //Find all URLs in text
            const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
            const allUrls = noteText.match(urlPattern) //null when no URLs found

            //Remove all duplicates (new Set(null) safely yields an empty set)
            let foundUrls = [...new Set(allUrls)]

            //Go through each attachment, check for existing URLs
            attachments.forEach(attachment => {

                let urlIndex = foundUrls.indexOf( attachment.url )

                if(urlIndex != -1){
                    //URL already scraped - reuse its text and drop it from the work list
                    solrAttachmentText += attachment.text
                    foundUrls.splice(urlIndex, 1)
                } else {
                    //URL no longer referenced by the note - delete the stale attachment
                    Attachment.delete(attachment.id)
                }
            })

            //No newly scraped URLs, resolve with looked up attachment text
            if(foundUrls.length == 0){
                return resolve(solrAttachmentText)
            }

            //Process the remaining URLs into attachments
            Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then( freshlyScrapedText => {
                resolve(solrAttachmentText + freshlyScrapedText)
            })
        })
    })
}
|
|
|
|
/**
 * Scrape every URL and create one attachment row per URL.
 *
 * @param {number} userId
 * @param {number} noteId
 * @param {?Array<string>} foundUrls - URLs to scrape; null/empty is allowed.
 * @returns {Promise<string>} Concatenation of the scraped text of every URL.
 *
 * Fix: the empty-input `resolve('')` was missing `return`, so a null
 * `foundUrls` fell through to `foundUrls.forEach` and threw. The hand-rolled
 * completion counter is replaced by Promise.all (processUrl never rejects -
 * it resolves '' on failure - so all() completes).
 */
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {

    console.log('About to scrape')
    console.log(foundUrls)

    //Nothing to scrape - resolve immediately with empty text
    if(foundUrls == null || foundUrls.length == 0){ return Promise.resolve('') }

    //Scrape all URLs in parallel; each processUrl creates its own DB entry
    return Promise.all(
        foundUrls.map(url => Attachment.processUrl(userId, noteId, url))
    )
    .then(scrapedTexts => scrapedTexts.join('')) //All URLs scraped, return combined text
}
|
|
|
|
|
|
/**
 * Scrape one URL and store its searchable text as an attachment row.
 *
 * Extracts the page title, the h1 text and the most frequent meaningful words
 * from elements whose class/id contains "content", then inserts an
 * attachment (attachment_type = 1) for the note.
 *
 * @param {number} userId
 * @param {number} noteId
 * @param {string} url - Address to fetch and scrape.
 * @returns {Promise<string>} Resolves with the scraped searchable text, or ''
 *   if the page could not be fetched. Never rejects, so callers that
 *   aggregate many scrapes always complete.
 *
 * Fix: the rp() chain had no .catch - a single bad link left this promise
 * (and every caller waiting on it) pending forever. Fetch failures now log
 * and resolve ''; insert failures log and still resolve the scraped text.
 */
Attachment.processUrl = (userId, noteId, url) => {
    return new Promise((resolve, reject) => {

        //Common/stop words excluded from the word-frequency ranking
        const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
        'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
        'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
        'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
        'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
        'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']

        //Collapses any run of whitespace into a single space
        var removeWhitespace = /\s+/g

        //Fetch the page; transform hands the handler a cheerio-parsed DOM
        const options = {
            uri: url,
            transform: function (body) {
                return cheerio.load(body);
            }
        }

        rp(options).then($ => {

            var desiredSearchText = ''

            let pageTitle = $('title').text().replace(removeWhitespace, " ")
            desiredSearchText += pageTitle + "\n"

            let header = $('h1').text().replace(removeWhitespace, " ")
            desiredSearchText += header + "\n"

            //Pull text out of anything that looks like main content
            let majorContent = ''
            majorContent += $('[class*=content]').text()
            .replace(removeWhitespace, " ") //Collapse whitespace runs
            .replace(/\W\s/g, '') //Strip non-word chars that precede whitespace (trailing punctuation)
            .substring(0,3000) //Limit characters
            .toLowerCase()
            majorContent += $('[id*=content]').text().replace(removeWhitespace, " ")
            .replace(removeWhitespace, " ")
            .replace(/\W\s/g, '')
            .substring(0,3000) //Limit characters
            .toLowerCase()

            //Count frequency of each word in scraped text
            let frequency = {}
            majorContent.split(' ').forEach(word => {
                if(excludeWords.includes(word)){
                    return //Exclude certain words
                }
                if(!frequency[word]){
                    frequency[word] = 0
                }
                frequency[word]++
            })

            //Create a sortable array, keeping only words used more than once
            var sortable = [];
            for (var index in frequency) {
                if(frequency[index] > 1){
                    sortable.push([index, frequency[index]]);
                }
            }

            //Sort them by most used words in the list
            sortable.sort(function(a, b) {
                return b[1] - a[1];
            });

            //Keep the top 15 most frequent words
            let finalWords = []
            for(let i=0; i<15; i++){
                if(sortable[i] && sortable[i][0]){
                    finalWords.push(sortable[i][0])
                }
            }

            desiredSearchText += finalWords.join(', ')
            console.log('TexT Scraped')
            console.log(desiredSearchText)

            //Unix timestamp (seconds) for last_indexed
            const created = Math.round((+new Date)/1000)

            //Create attachment in DB with scrape text and provided data
            db.promise()
            .query(`INSERT INTO attachment
                (note_id, user_id, attachment_type, text, url, last_indexed)
                VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
            .then((rows, fields) => {
                resolve(desiredSearchText) //Return found text
            })
            .catch(error => {
                //Insert failed - log it, but still hand back the scraped text
                console.log(error)
                resolve(desiredSearchText)
            })

        })
        .catch(error => {
            //Scrape failed (dead link, timeout, non-HTML response).
            //Log and resolve empty so aggregating callers can finish.
            console.log('Unable to scrape url', url)
            console.log(error.message)
            resolve('')
        })
    })
}