* Added some better base information to site for scrapers
* Updated help text
* Refactored a lot of the scrape code into a SiteScrape helper
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
let db = require('@config/database')
|
||||
|
||||
let SiteScrape = require('@helpers/SiteScrape')
|
||||
|
||||
let Attachment = module.exports = {}
|
||||
|
||||
const cheerio = require('cheerio')
|
||||
@@ -242,32 +244,8 @@ Attachment.scanTextForWebsites = (io, userId, noteId, noteText) => {
|
||||
|
||||
Attachment.urlForNote(userId, noteId).then(attachments => {
|
||||
|
||||
//Find all URLs in text
|
||||
//@TODO - Use the process text library for this function
|
||||
const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
|
||||
let allUrls = noteText.match(urlPattern)
|
||||
|
||||
if(allUrls == null){
|
||||
allUrls = []
|
||||
}
|
||||
|
||||
//Every URL needs HTTPS!!!
|
||||
let foundUrls = []
|
||||
allUrls.forEach( (item, index) => {
|
||||
//Every URL should have HTTPS
|
||||
if(item.indexOf('https://') == -1 && item.indexOf('http://') == -1){
|
||||
allUrls[index] = 'https://'+item
|
||||
}
|
||||
//URLs should all have HTTPS!!!
|
||||
if(item.indexOf('http://') >= 0){
|
||||
allUrls[index] = item.replace('http://','https://')
|
||||
}
|
||||
})
|
||||
|
||||
//Remove all duplicates
|
||||
foundUrls = [...new Set(allUrls)]
|
||||
|
||||
|
||||
//Pull all the URLs out of the text
|
||||
let foundUrls = SiteScrape.getCleanUrls(noteText)
|
||||
|
||||
//Go through each saved URL, remove new URLs from saved URLs
|
||||
//If a URL is not found, delete it
|
||||
@@ -386,14 +364,6 @@ Attachment.processUrl = (userId, noteId, url) => {
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
|
||||
const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
|
||||
'give','use','find','tell','ask','work','seem','feel','try','leave','call','good','new','first','last','long','great','little','own','other','old',
|
||||
'right','big','high','different','small','large','next','early','young','important','few','public','bad','same','able','to','of','in','for','on',
|
||||
'with','at','by','from','up','about','into','over','after','the','and','a','that','I','it','not','he','as','you','this','but','his','they','her',
|
||||
'she','or','an','will','my','one','all','would','there','their','and','that','but','or','as','if','when','than','because','while','where','after',
|
||||
'so','though','since','until','whether','before','although','nor','like','once','unless','now','except','are','also','is','your','its']
|
||||
|
||||
var removeWhitespace = /\s+/g
|
||||
|
||||
const options = {
|
||||
uri: url,
|
||||
@@ -428,70 +398,33 @@ Attachment.processUrl = (userId, noteId, url) => {
|
||||
})
|
||||
.then($ => {
|
||||
|
||||
//Clear timeout that would end this function
|
||||
clearTimeout(requestTimeout)
|
||||
|
||||
var desiredSearchText = ''
|
||||
|
||||
let pageTitle = $('title').text().replace(removeWhitespace, " ")
|
||||
desiredSearchText += pageTitle + "\n"
|
||||
|
||||
// let header = $('h1').text().replace(removeWhitespace, " ")
|
||||
// desiredSearchText += header + "\n"
|
||||
|
||||
//Scrape metadata for page image
|
||||
let metadata = $('meta[property="og:image"]')
|
||||
if(metadata && metadata[0] && metadata[0].attribs){
|
||||
thumbnail = metadata[0].attribs.content
|
||||
}
|
||||
const pageTitle = SiteScrape.getTitle($)
|
||||
|
||||
const hostname = SiteScrape.getHostName(url)
|
||||
|
||||
let majorContent = ''
|
||||
majorContent += $('[class*=content]').text()
|
||||
.replace(removeWhitespace, " ") //Remove all whitespace
|
||||
.replace(/\W\s/g, '') //Remove all non alphanumeric characters
|
||||
.substring(0,3000)
|
||||
.toLowerCase()
|
||||
majorContent += $('[id*=content]').text().replace(removeWhitespace, " ")
|
||||
.replace(removeWhitespace, " ") //Remove all whitespace
|
||||
.replace(/\W\s/g, '') //Remove all non alphanumeric characters
|
||||
.substring(0,3000) //Limit characters
|
||||
.toLowerCase()
|
||||
const thumbnail = SiteScrape.getDisplayImage($, url)
|
||||
|
||||
//Count frequency of each word in scraped text
|
||||
let frequency = {}
|
||||
majorContent.split(' ').forEach(word => {
|
||||
if(excludeWords.includes(word)){
|
||||
return //Exclude certain words
|
||||
}
|
||||
if(!frequency[word]){
|
||||
frequency[word] = 0
|
||||
}
|
||||
frequency[word]++
|
||||
const keywords = SiteScrape.getKeywords($)
|
||||
|
||||
var desiredSearchText = ''
|
||||
desiredSearchText += pageTitle + "\n"
|
||||
desiredSearchText += keywords
|
||||
|
||||
console.log({
|
||||
pageTitle,
|
||||
hostname,
|
||||
thumbnail,
|
||||
keywords
|
||||
})
|
||||
|
||||
|
||||
//Create a sortable array
|
||||
var sortable = [];
|
||||
for (var index in frequency) {
|
||||
if(frequency[index] > 1){
|
||||
sortable.push([index, frequency[index]]);
|
||||
}
|
||||
}
|
||||
|
||||
//Sort them by most used words in the list
|
||||
sortable.sort(function(a, b) {
|
||||
return b[1] - a[1];
|
||||
});
|
||||
|
||||
let finalWords = []
|
||||
for(let i=0; i<5; i++){
|
||||
if(sortable[i] && sortable[i][0]){
|
||||
finalWords.push(sortable[i][0])
|
||||
}
|
||||
}
|
||||
|
||||
if(finalWords.length > 0){
|
||||
desiredSearchText += 'Keywords: ' + finalWords.join(', ')
|
||||
}
|
||||
// throw new Error('Ending this function early.')
|
||||
|
||||
|
||||
// console.log('TexT Scraped')
|
||||
@@ -532,9 +465,10 @@ Attachment.processUrl = (userId, noteId, url) => {
|
||||
|
||||
})
|
||||
.catch(error => {
|
||||
console.log('Issue with scrape')
|
||||
// console.log('Scrape pooped out')
|
||||
// console.log('Issue with scrape')
|
||||
console.log(error)
|
||||
resolve('')
|
||||
// resolve('')
|
||||
})
|
||||
|
||||
requestTimeout = setTimeout( () => {
|
||||
|
@@ -167,10 +167,10 @@ Note.update = (io, userId, noteId, noteText, noteTitle, color, pinned, archived,
|
||||
return new Promise((resolve, reject) => {
|
||||
|
||||
//Prevent note loss if it saves with empty text
|
||||
if(ProcessText.removeHtml(noteText) == ''){
|
||||
//if(ProcessText.removeHtml(noteText) == ''){
|
||||
// console.log('Not saving empty note')
|
||||
// resolve(false)
|
||||
}
|
||||
//}
|
||||
|
||||
const now = Math.round((+new Date)/1000)
|
||||
|
||||
|
Reference in New Issue
Block a user