Email marketing is still the #1 lead generator. There are companies that will sell you business info along with emails, phone numbers etc. But if you are a developer like me, you will want to cut those costs and build your own list of contacts. In this tutorial, we will go over the steps of scraping emails from publicly available websites.
MongoDB is an excellent database to work with, and it even has a free tier. Check it out at MongoDB.
Node JS is a superfast, non-blocking IO JavaScript runtime built on Chrome's V8 engine. It is a perfect tool for projects that are not too computationally heavy.
Puppeteer is a Node library that drives a (predominantly headless) browser, which means it can work in the background and expose an API for us developers to interact with. It can be used for testing, scraping and auditing websites. With the advent of SPAs (single page applications), where the content is loaded dynamically, traditional request-based scraping will not work; we need a real browser to render the page before scraping the data. Puppeteer launches a Chromium browser in the background for exactly that.
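Just to illustrate what that buys us, here is a bare-bones sketch (the URL is only a placeholder): launch headless Chromium, let the page finish rendering, then read the resulting HTML.
// minimal Puppeteer flow (illustrative only)
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();    // headless Chromium running in the background
    const page = await browser.newPage();
    await page.goto('https://example.com', { waitUntil: 'networkidle0' }); // wait for dynamic content
    const html = await page.content();           // fully rendered HTML, SPA content included
    console.log(html.length, 'characters of rendered HTML');
    await browser.close();
})();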
Let's discuss the steps involved in building this email scraper. The whole point of collecting emails is to run some sort of campaign with them. We will choose MongoDB to hold the data. In my opinion MySQL is a better choice if you want to develop more features like email tracking, event hooks, subscribe, unsubscribe etc. Without further ado, let's get into it.
Since we will be running the scraper in loops, it's important to reuse the MongoDB connection instead of opening a new one on every iteration, which would eventually leak connections and memory.
npm i --save mongodb
// mongoPool.js
const mongo = require('mongodb').MongoClient;

let mUrl = '<REPLACE WITH YOUR MONGODB URL>'
var connection = null

function getConnection(cb) {
    if (connection != null) {
        // console.log("Connection reused")
        cb(connection)
    } else {
        mongo.connect(mUrl, {
            useNewUrlParser: true,
            useUnifiedTopology: true
        }, function(err, db) {
            if (err == null) {
                // console.log("Connection Created")
                connection = db
                cb(connection)
            } else {
                // console.log("Connection Failed ", err)
                connection = null
                cb(connection)
            }
        })
    }
}

module.exports.getConnection = getConnection
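If you want to sanity check the pool, here is roughly how it would be consumed elsewhere in the project. I'm using the same myDB database and profiles collection that appear later in this tutorial; adjust to your own setup.
// usage sketch, assuming mongoPool.js sits in the same folder
const mongoPool = require('./mongoPool')

mongoPool.getConnection((pool) => {
    if (pool !== null) {
        const dbo = pool.db('myDB').collection('profiles')
        dbo.countDocuments({}, (err, count) => {
            console.log('profiles stored so far:', count)
        })
    }
})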
Similarly, we want to reuse the browser object without closing it; instead, we will open and close pages programmatically. Puppeteer has everything we need to scrape without requiring any other packages like "cheerio", "request" etc.
npm i --save puppeteer
// google.js
const puppeteer = require('puppeteer');

var browser = null;

// Launch the browser only once and reuse it across searches
async function initializeLaunch(){
    if(browser === null){
        browser = await puppeteer.launch();
    }
}

async function search(query){
    try{
        await initializeLaunch()
        const page = await browser.newPage();
        await page.goto('https://www.google.com/');
        await page.waitForSelector('input[aria-label="Search"]', {visible: true});
        // Inject code here to Scrape results
        await page.close()
    }catch(err){
        // On any failure, discard the browser so the next run starts fresh
        await browser.close()
        browser = null
    }
}

async function closeBrowser(){
    await browser.close()
}

module.exports.search = search
module.exports.closeBrowser = closeBrowser
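And here is a rough idea of how this module could be driven; the keyword list below is just an example.
// example driver, illustrative only
const google = require('./google')

const keywords = ['Digital Marketing in CT'] // add as many queries as you need

;(async () => {
    for (const q of keywords) {
        await google.search(q)   // each call opens and closes its own page
    }
    await google.closeBrowser()  // release Chromium once all searches are done
})()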
I chose the keyword "Digital Marketing in CT" and ran it through the scraper. The code below goes in place of the "Inject code here to Scrape results" comment inside search().
await page.waitForSelector('input[aria-label="Search"]', {visible: true});
// Type the query and submit the search
await page.type('input[aria-label="Search"]', query)
await Promise.all([
    page.waitForNavigation(),
    page.keyboard.press('Enter')
])
await page.waitForSelector(".LC20lb", {visible: true});
// page.evaluate runs inside the browser, so only collect the titles and links here
var results = await page.evaluate(() => {
    var nodes = document.querySelectorAll(".LC20lb")
    var serp = []
    for (var n of nodes) {
        serp.push({title: n.innerText, link: n.parentNode.href})
    }
    return serp
});
// Directories and social sites we don't want to crawl for emails
var sites = ['linkedin','merchantcircle','angi','facebook','houzz','pinterest','instagram','twitter','bark','yelp','upcity','signalhire','clover','youtube','zoominfo','pixabay','dandb','dnb','manta','buzzfile','mapquest','smallbusinessdb','bbb','porch','whereisalocal','yellowpages','yellow.place','allpeople','verview','wikipedia','showmelocal','chamberofcommerce','finduslocal']
// MongoDB isn't reachable from the page context, so do the insert back in Node
// (assuming: const mongoPool = require('./mongoPool'))
mongoPool.getConnection((pool) => {
    if (pool !== null) { // you can also use "if(pool)" but i went for a cleaner syntax
        const dbo = pool.db('myDB').collection("profiles")
        for (var r of results) {
            var allow = true
            for (var s of sites) {
                if (r.link.includes(s)) { allow = false; break; }
            }
            if (allow) dbo.insertOne(r, function(err, res) {});
        }
    }
})
await page.close()
await browser.close();
browser = null
Next, open each saved link in a new page and grab the fully rendered HTML with await page.content(). Apply a JavaScript regex on the content to filter the emails into an array. These emails need to be unique and free of false positives.
// Pull anything that looks like an email address out of the page HTML
function extractEmails(text) {
    return text.match(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)/gi);
}

// Array.filter helper that keeps only the first occurrence of each value
function onlyUnique(value, index, self) {
    return self.indexOf(value) === index;
}
await page.goto(url, {
    waitUntil: 'networkidle0'
});
// await page.waitForNavigation();
var pageContent = await page.content()
var email = extractEmails(pageContent)
if(email !== null) {
    // Drop false positives like retina image names (logo@2x.png) and de-duplicate
    email = email.filter(y => (!y.includes('@2x.png') && !y.includes('@3x.png')))
    email = email.filter(onlyUnique)
}
await page.close()

// "db" is the profiles collection obtained via mongoPool;
// store the extracted emails alongside the url (adjust the field names to your schema)
var oid = require('mongodb').ObjectID
db.updateOne({_id: oid(id)}, {$set: {'d.url': url, emails: email}}, function(err, result) {
    // Rerun for another url (Pagination)
});
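To put the pieces together, the outer loop could look something like the sketch below. processUrl() is a hypothetical helper that wraps the goto/extract/update code above; it is not part of the original snippets.
// sketch of the outer loop; processUrl() is a hypothetical wrapper around the code above
const mongoPool = require('./mongoPool')

mongoPool.getConnection((pool) => {
    if (pool !== null) {
        const profiles = pool.db('myDB').collection('profiles')
        // pick up only the profiles we haven't extracted emails for yet
        profiles.find({ emails: { $exists: false } }).toArray(async (err, docs) => {
            if (err) return console.log(err)
            for (const doc of docs) {
                await processUrl(profiles, doc._id, doc.link) // navigate, extract, updateOne
            }
        })
    }
})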
When you run Puppeteer continuously, it tends to save Chrome profiles in the default temp folder /tmp/ (on Ubuntu Linux). A lot of profiles pile up under the snap.chromium/tmp folder, and they need periodic cleanup if you don't want to run out of disk space. Use this code to delete only the profiles that are no longer needed.
try{
    var fs = require('fs')
    var act = '/tmp/snap.chromium/tmp'
    var keep = fs.readdirSync('/tmp/')
    var remo = fs.readdirSync(act)
    remo.map(file => {
        if(file.includes('puppeteer_dev_chrome_profile')){
            if(keep.indexOf(file) === -1) {
                fs.rmdirSync(`${act}/${file}`, {recursive: true})
                console.log("Removed", file)
            }else{
                console.log("Kept", file)
            }
        }
    })
}catch(err){
    console.log("Some error", err)
}
This scraper's functionality can be extended, for example by scraping the complete website rather than just the landing page to increase the chance of finding emails (the npm package website-scraper can help with that), or by turning it into a standalone app using Electron JS. A rough sketch of the full-site idea follows below.
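Here is a minimal sketch of that idea using only Puppeteer and the extractEmails()/onlyUnique() helpers from above; the one-level depth and the 10-page cap are just my assumptions.
// sketch only: collect emails from a site's internal pages, one level deep
async function crawlSiteForEmails(browser, startUrl) {
    const page = await browser.newPage()
    await page.goto(startUrl, { waitUntil: 'networkidle0' })
    // gather same-domain links from the landing page
    const links = await page.evaluate(() =>
        Array.from(document.querySelectorAll('a[href]'))
            .map(a => a.href)
            .filter(href => href.startsWith(location.origin))
    )
    let emails = extractEmails(await page.content()) || []
    // visit a handful of internal pages and collect emails from each
    for (const link of links.slice(0, 10)) {
        try {
            await page.goto(link, { waitUntil: 'networkidle0' })
            emails = emails.concat(extractEmails(await page.content()) || [])
        } catch (e) { /* skip pages that fail to load */ }
    }
    await page.close()
    return emails.filter(onlyUnique)
}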