let db = require('@config/database')
let Attachment = module.exports = {}
const cheerio = require('cheerio');
const rp = require('request-promise');
// Looks up the attachments stored for one note belonging to one user.
// Runs a parameterised SELECT on the `attachment` table filtered by user_id, note_id
// and attachment_type = 1 (the same type value processUrl writes for scraped URLs).
// Returns a Promise that resolves with element 0 of the value the query resolves with.
// NOTE(review): the object `.query(...)` is called on, any rejection handling and the
// closing braces of this definition are not present in this copy of the file — restore before use.
Attachment.forNote = (userId, noteId) => {
return new Promise((resolve, reject) => {
// userId and noteId are bound to the two `?` placeholders, in that order
.query(`SELECT * FROM attachment WHERE user_id = ? AND note_id = ? AND attachment_type = 1;`, [userId, noteId])
.then((rows, fields) => {
resolve(rows[0]) //Resolve with the first element of the query result; presumably the array of matching attachment rows, since scanTextForWebsites iterates it with forEach — TODO confirm against the DB driver
// Deletes a single row from the `attachment` table by its id.
// Returns a Promise that resolves with element 0 of the value the DELETE query resolves with.
// NOTE(review): the object `.query(...)` is called on, any rejection handling and the
// closing braces of this definition are not present in this copy of the file — restore before use.
Attachment.delete = (attachmentId) => {
return new Promise((resolve, reject) => {
// attachmentId is bound to the single `?` placeholder
.query(`DELETE FROM attachment WHERE id = ?`, [attachmentId])
.then((rows, fields) => {
resolve(rows[0]) //Resolve with the first element of the query result; for a DELETE this is presumably the driver's result summary rather than rows — TODO confirm
// Builds the searchable attachment text for a note from the URLs found in its text.
// Visible flow: load the note's stored attachments, extract URLs from noteText with a regex,
// reuse the stored text of attachments whose url is still among the found URLs, then pass the
// remaining URLs to Attachment.scrapeUrlsCreateAttachments and append the text it resolves with.
// Returns a Promise; the empty-text path resolves with '' (other resolve calls are not visible here).
// NOTE(review): several statements and all closing braces of this definition are missing
// from this copy of the file; the notes below mark the gaps that affect behaviour.
Attachment.scanTextForWebsites = (userId, noteId, noteText) => {
return new Promise((resolve, reject) => {
let solrAttachmentText = '' //Final searchable scrape text for note
// NOTE(review): resolves for empty text but does not return, so execution continues into the lookup below
if(noteText.length == 0){ resolve(solrAttachmentText) }
// Fetch the attachments already stored for this note
Attachment.forNote(userId, noteId).then(attachments => {
//Find all URLs in text
// Matches http(s)/ftp/file scheme URLs as well as bare www./ftp. prefixes; flags are case-insensitive, global, multiline
const urlPattern = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])/igm
// String.prototype.match with a global regex yields an array of matches, or null when nothing matches
let allUrls = noteText.match(urlPattern)
//Remove all duplicates
// NOTE(review): `Set` is called here without `new`, which throws a TypeError in JavaScript;
// this probably was meant to be `[...new Set(allUrls)]` — confirm and fix
let foundUrls = [ Set(allUrls)]
//Go through each attachment, check for existing URLs
attachments.forEach(attachment => {
//URL already scraped, push text and continue
let urlIndex = foundUrls.indexOf( attachment.url )
if(urlIndex != -1){
solrAttachmentText += attachment.text
foundUrls.splice(urlIndex, 1) //Remove existing from set of found
// NOTE(review): the body of this else branch (stored attachment whose URL is no longer in the text) is not present in this copy
} else {
//No newly scraped URLs, resolve with looked up attachment text
// NOTE(review): the statement guarded by this check is not visible; presumably it resolves with solrAttachmentText — TODO confirm
if(foundUrls == null || foundUrls.length == 0){
//Process the remaining URLs into attachments
Attachment.scrapeUrlsCreateAttachments(userId, noteId, foundUrls).then( freshlyScrapedText => {
solrAttachmentText += freshlyScrapedText
//Return scraped text from each URL
// Scrapes every URL in foundUrls and concatenates the text produced for each one.
// Each URL is handed to Attachment.processUrl together with userId and noteId.
//
// userId    - passed through unchanged to Attachment.processUrl
// noteId    - passed through unchanged to Attachment.processUrl
// foundUrls - array of URLs to scrape; null, undefined or an empty array is allowed
//
// Returns a Promise that always resolves with a string: the scrape texts joined in
// foundUrls order, or '' when there is nothing to scrape. A URL whose scrape rejects
// is logged and contributes '' so one failure cannot leave the caller waiting forever.
Attachment.scrapeUrlsCreateAttachments = (userId, noteId, foundUrls) => {
    console.log('About to scrape')

    //Nothing to scrape: resolve immediately and stop, never touch a null list
    if(foundUrls == null || foundUrls.length === 0){
        return Promise.resolve('')
    }

    //Start every scrape at once, a DB entry is expected to be created by processUrl for each
    const pendingScrapes = foundUrls.map(url => {
        return Attachment.processUrl(userId, noteId, url)
        .catch(error => {
            //Best effort: a failed scrape must not block the text of the other URLs
            console.log('Unable to scrape URL', url, error)
            return ''
        })
    })

    //Promise.all keeps input order, so the result does not depend on which scrape finishes first
    return Promise.all(pendingScrapes).then(scrapedTexts => scrapedTexts.join(''))
}
// Scrapes a single URL and stores the result as an attachment row for the note.
// Visible flow: request the page (body parsed with cheerio), build `desiredSearchText` from the
// <title>, the <h1> text and a list of frequently used words taken from elements whose class or id
// contains "content", INSERT that text into the `attachment` table with attachment_type 1, and
// resolve with the text once the INSERT succeeds. A timer additionally stores and resolves
// with a fallback message.
// Returns a Promise that resolves with the stored text.
// NOTE(review): this definition is truncated in this copy of the file — the rest of the
// excludeWords list, the object `.query(...)` is called on, several loop/if bodies and all
// closing braces are missing. The notes below mark the gaps.
Attachment.processUrl = (userId, noteId, url) => {
return new Promise((resolve, reject) => {
// Words left out of the frequency ranking below. NOTE(review): the list is cut off after 'want' in this copy
const excludeWords = ['share','facebook','twitter','reddit','be','have','do','say','get','make','go','know','take','see','come','think','look','want',
// Matches runs of whitespace; used to collapse each run into a single space
var removeWhitespace = /\s+/g
// console.log('Scraping ', website)
// Options object handed to request-promise (`rp`)
const options = {
uri: url,
simple: true,
timeout: 1000 * 10, // 10 seconds
headers: {
'User-Agent':'Mozilla/5.0 (compatible; Googlebot/2.1; +' //Simulate google headers
// The response body is loaded into cheerio, so the promise resolves with a `$` query function
transform: function (body) {
return cheerio.load(body);
// Handle for the fallback timer created further down. NOTE(review): no clearTimeout is visible in this copy
let requestTimeout = null
let request = rp(options)
.then($ => {
var desiredSearchText = ''
// Page title with whitespace runs collapsed
let pageTitle = $('title').text().replace(removeWhitespace, " ")
desiredSearchText += pageTitle + "\n"
// Text of every <h1> on the page, whitespace runs collapsed
let header = $('h1').text().replace(removeWhitespace, " ")
desiredSearchText += header + "\n"
let majorContent = ''
// Text of every element whose class attribute contains "content"
majorContent += $('[class*=content]').text()
.replace(removeWhitespace, " ") //Collapse each whitespace run into a single space
.replace(/\W\s/g, '') //Remove every non-word character that is directly followed by whitespace, together with that whitespace
// Text of every element whose id attribute contains "content"; only this second chunk is cut to 3000 characters
majorContent += $('[id*=content]').text().replace(removeWhitespace, " ")
.replace(removeWhitespace, " ") //Collapse each whitespace run into a single space (second pass)
.replace(/\W\s/g, '') //Remove every non-word character that is directly followed by whitespace, together with that whitespace
.substring(0,3000) //Limit characters
//Count frequency of each word in scraped text
let frequency = {}
// NOTE(review): the loop body is incomplete in this copy; the excludeWords check and the counter increment are not visible
majorContent.split(' ').forEach(word => {
return //Exclude certain words
frequency[word] = 0
//Create a sortable array
var sortable = [];
// Keep only words counted more than once, as [word, count] pairs
for (var index in frequency) {
if(frequency[index] > 1){
sortable.push([index, frequency[index]]);
//Sort them by most used words in the list
sortable.sort(function(a, b) {
return b[1] - a[1];
let finalWords = []
// Looks at the first 15 entries of the sorted list at most
for(let i=0; i<15; i++){
// NOTE(review): the body of this check is missing; presumably it pushes the word into finalWords — TODO confirm
if(sortable[i] && sortable[i][0]){
desiredSearchText += finalWords.join(', ')
console.log('TexT Scraped')
// Current time as a Unix timestamp in whole seconds
const created = Math.round((+new Date)/1000)
//Create attachment in DB with scrape text and provided data
.query(`INSERT INTO attachment
(note_id, user_id, attachment_type, text, url, last_indexed)
VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
.then((rows, fields) => {
resolve(desiredSearchText) //Return found text
.catch(error => {
console.log('Issue with scrape')
// Fallback timer: after 5000 ms (see the delay argument below) a placeholder attachment is stored and resolved with.
// NOTE(review): 5 s is shorter than the 10 s request timeout configured in `options` — confirm that is intended
requestTimeout = setTimeout( () => {
console.log('Cancel the request, its taking to long.')
// NOTE(review): `desiredSearchText` is declared with `var` inside the `.then` callback above; no declaration
// is visible in this scope, so this looks like an assignment to an undeclared variable — confirm
desiredSearchText = 'Unable to Scrape URL at this time'
// Current time as a Unix timestamp in whole seconds
const created = Math.round((+new Date)/1000)
//Create attachment in DB with scrape text and provided data
.query(`INSERT INTO attachment
(note_id, user_id, attachment_type, text, url, last_indexed)
VALUES (?, ?, ?, ?, ?, ?)`, [noteId, userId, 1, desiredSearchText, url, created])
.then((rows, fields) => {
resolve(desiredSearchText) //Return the fallback message
}, (5000))