What will be scraped

image

Preparation

First, we need to create a Node.js* project and add npm packages puppeteer, puppeteer-extra and puppeteer-extra-plugin-stealth to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.

To do this, in the directory with our project, open the command line and enter npm init -y, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth.

*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.

馃搶Note: also, you can use puppeteer without any extensions, but I strongly recommended use it with puppeteer-extra with puppeteer-extra-plugin-stealth to prevent website detection that you are using headless Chromium or that you are using web driver. You can check it on Chrome headless tests website. The screenshot below shows you a difference.

stealth

Process

SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have any struggles understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.

The Gif below illustrates the approach of selecting different parts of the results.

how

Full code

馃搶Notes:

  • To make our search more relevant we need to add GPS coordinates parameter. It has to be constructed in the next sequence: @ + latitude + , + longitude + , + zoom. This will form a string that looks like this: e.g. @47.6040174,-122.1854488,11z. The zoom parameter is optional but recommended for higher precision (it ranges from 3z, map completely zoomed out - to 21z, map completely zoomed in). We have a dedicated video on our YouTube channel explaining What's and Why's about Google Maps GPS Coordinates.
  • Sometimes Google displays results from local places using pagination, and sometimes it loads more results as you scroll. This code works for both cases. If pagination is displayed in your case, you need to uncomment the while loop and internal lines in the getLocalPlacesInfo function.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const requestParams = {
  baseURL: `http://google.com`,
  query: "starbucks",                                          // what we want to search
  coordinates: "@47.6040174,-122.1854488,11z",                 // parameter defines GPS coordinates of location where you want your query to be applied
  hl: "en",                                                    // parameter defines the language to use for the Google maps search
};

async function scrollPage(page, scrollContainer) {
  let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);

  while (true) {
    await page.evaluate(`document.querySelector("${scrollContainer}").scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
    await page.waitForTimeout(2000);
    let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
    if (newHeight === lastHeight) {
      break;
    }
    lastHeight = newHeight;
  }
}

async function fillDataFromPage(page) {
  const dataFromPage = await page.evaluate(() => {
    return Array.from(document.querySelectorAll(".bfdHYd")).map((el) => {
      const placeUrl = el.parentElement.querySelector(".hfpxzc")?.getAttribute("href");
      const urlPattern = /!1s(?<id>[^!]+).+!3d(?<latitude>[^!]+)!4d(?<longitude>[^!]+)/gm;                     // https://regex101.com/r/KFE09c/1
      const dataId = [...placeUrl.matchAll(urlPattern)].map(({ groups }) => groups.id)[0];
      const latitude = [...placeUrl.matchAll(urlPattern)].map(({ groups }) => groups.latitude)[0];
      const longitude = [...placeUrl.matchAll(urlPattern)].map(({ groups }) => groups.longitude)[0];
      return {
        title: el.querySelector(".qBF1Pd")?.textContent.trim(),
        rating: el.querySelector(".MW4etd")?.textContent.trim(),
        reviews: el.querySelector(".UY7F9")?.textContent.replace("(", "").replace(")", "").trim(),
        type: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(1) > span:first-child")?.textContent.replaceAll("路", "").trim(),
        address: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(1) > span:last-child")?.textContent.replaceAll("路", "").trim(),
        openState: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(3) > span:first-child")?.textContent.replaceAll("路", "").trim(),
        phone: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(3) > span:last-child")?.textContent.replaceAll("路", "").trim(),
        website: el.querySelector("a[data-value]")?.getAttribute("href"),
        description: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(2)")?.textContent.replace("路", "").trim(),
        serviceOptions: el.querySelector(".qty3Ue")?.textContent.replaceAll("路", "").replaceAll("  ", " ").trim(),
        gpsCoordinates: {
          latitude,
          longitude,
        },
        placeUrl,
        dataId,
      };
    });
  });
  return dataFromPage;
}

async function getLocalPlacesInfo() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  const page = await browser.newPage();

  const URL = `${requestParams.baseURL}/maps/search/${requestParams.query}/${requestParams.coordinates}?hl=${requestParams.hl}`;

  await page.setDefaultNavigationTimeout(60000);
  await page.goto(URL);

  await page.waitForNavigation();

  const scrollContainer = ".m6QErb[aria-label]";

  const localPlacesInfo = [];

  // while (true) {
    await page.waitForTimeout(2000);
    // const nextPageBtn = await page.$("#eY4Fjd:not([disabled])");
    // if (!nextPageBtn) break;
    await scrollPage(page, scrollContainer);
    localPlacesInfo.push(...(await fillDataFromPage(page)));
    // await page.click("#eY4Fjd");
  // }

  await browser.close();

  return localPlacesInfo;
}

getLocalPlacesInfo().then(console.log);

Code explanation

Declare constants from required libraries:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
Code Explanation
puppeteer Chromium control library
StealthPlugin library for prevent website detection that you are using web driver

Next, we "say" to puppeteer use StealthPlugin:

puppeteer.use(StealthPlugin());

Next, we write what we want to search and the necessary parameters for making a request:

const requestParams = {
  baseURL: `http://google.com`,
  query: "starbucks",
  coordinates: "@47.6040174,-122.1854488,11z", 
  hl: "en", 
};
Code Explanation
query search query
coordinates parameter defines GPS coordinates of location where you want your query to be applied. See more on Google Maps Help
hl parameter defines the language to use for the Google Maps search

Next, we write down a function for scrolling places container on the page:

async function scrollPage(page, scrollContainer) {
  let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);

  while (true) {
    await page.evaluate(`document.querySelector("${scrollContainer}").scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
    await page.waitForTimeout(2000);
    let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
    if (newHeight === lastHeight) {
      break;
    }
    lastHeight = newHeight;
  }
}
Code Explanation
lastHeight current scrollheight of the container
page.evaluate('document.querySelector... runs code from the brackets in the browser console and returns the result
page.waitForTimeout(2000) waiting 2000 ms before continue
newHeight scrollheight of the container after scroll

Next, we write down a function for getting places info from page:

async function fillDataFromPage(page) {
  const dataFromPage = await page.evaluate(() => {
    return Array.from(document.querySelectorAll(".bfdHYd")).map((el) => {
      const placeUrl = el.parentElement.querySelector(".hfpxzc")?.getAttribute("href");
      const urlPattern = /!1s(?<id>[^!]+).+!3d(?<latitude>[^!]+)!4d(?<longitude>[^!]+)/gm;                     // https://regex101.com/r/KFE09c/1
      const dataId = [...placeUrl.matchAll(urlPattern)].map(({ groups }) => groups.id)[0];
      const latitude = [...placeUrl.matchAll(urlPattern)].map(({ groups }) => groups.latitude)[0];
      const longitude = [...placeUrl.matchAll(urlPattern)].map(({ groups }) => groups.longitude)[0];
      return {
        title: el.querySelector(".qBF1Pd")?.textContent.trim(),
        rating: el.querySelector(".MW4etd")?.textContent.trim(),
        reviews: el.querySelector(".UY7F9")?.textContent.replace("(", "").replace(")", "").trim(),
        type: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(1) > span:first-child")?.textContent.replaceAll("路", "").trim(),
        address: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(1) > span:last-child")?.textContent.replaceAll("路", "").trim(),
        openState: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(3) > span:first-child")?.textContent.replaceAll("路", "").trim(),
        phone: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(3) > span:last-child")?.textContent.replaceAll("路", "").trim(),
        website: el.querySelector("a[data-value]")?.getAttribute("href"),
        description: el.querySelector(".W4Efsd:last-child > .W4Efsd:nth-of-type(2)")?.textContent.replace("路", "").trim(),
        serviceOptions: el.querySelector(".qty3Ue")?.textContent.replaceAll("路", "").replaceAll("  ", " ").trim(),
        gpsCoordinates: {
          latitude,
          longitude,
        },
        placeUrl,
        dataId,
      };
    });
  });
  return dataFromPage;
}
Code Explanation
document.querySelectorAll(".bfdHYd") returns a static NodeList representing a list of the document's elements that match the css selectors with class name bfdHYd
el.querySelector(".qBF1Pd") returns the first html element with selector .qBF1Pd which is any child of the el html element
.getAttribute("href") gets the href attribute value of the html element
urlPattern a RegEx pattern for search and define id, latitude and longitude. See what it allows you to find
[...placeUrl.matchAll(urlPattern)] in this code we use spread syntax to create an array from an iterator that was returned from matchAll method (in this case this entry is equal to Array.from(placeUrl.matchAll(urlPattern)))
.textContent gets the raw text of html element
.trim() removes whitespace from both ends of a string

And finally, a function to control the browser, and get information:

async function getLocalPlacesInfo() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  const page = await browser.newPage();

  const URL = `${requestParams.baseURL}/maps/search/${requestParams.query}/${requestParams.coordinates}?hl=${requestParams.hl}`;

  await page.setDefaultNavigationTimeout(60000);
  await page.goto(URL);

  await page.waitForNavigation();

  const scrollContainer = ".m6QErb[aria-label]";

  const localPlacesInfo = [];

  // while (true) {
    await page.waitForTimeout(2000);
    // const nextPageBtn = await page.$("#eY4Fjd:not([disabled])");
    // if (!nextPageBtn) break;
    await scrollPage(page, scrollContainer);
    localPlacesInfo.push(...(await fillDataFromPage(page)));
    // await page.click("#eY4Fjd");
  // }

  await browser.close();

  return localPlacesInfo;
}

getLocalPlacesInfo().then(console.log);
Code Explanation
puppeteer.launch({options}) this method launches a new instance of the Chromium browser with current options
headless defines which mode to use: headless (by default) or non-headless
args an array with arguments which is used with Chromium
["--no-sandbox", "--disable-setuid-sandbox"] these arguments we use to allow the launch of the browser process in the online IDE
browser.newPage() this method launches a new page
page.setDefaultNavigationTimeout(60000) changing default (30 sec) time for waiting for selectors to 60000 ms (1 min) for slow internet connection
page.goto(URL) navigation to URL which is defined above
page.$("#eY4Fjd:not([disabled])") this methods finds the html element with the #eY4Fjd:not([disabled]) selector and return it
localPlacesInfo.push(...(await fillDataFromPage(page))) in this code, we use spread syntax to split the array which is returned from the fillDataFromPage function into elements and add them to the end of the localPlacesInfo array
page.click("#eY4Fjd") this methods emulates mouse click on the html element with the #eY4Fjd selector
browser.close() after all we close the browser instance

Now we can launch our parser. To do this enter node YOUR_FILE_NAME in your command line. Where YOUR_FILE_NAME is the name of your .js file.

Output

馃搶Note: if you see something like [Object] in your console you can use console.dir(result, { depth: null }) instead console.log(). Watch Node.js documentation for more info.

[
[
   {
      "title":"Starbucks",
      "rating":"4.2",
      "reviews":"233",
      "type":"Coffee shop",
      "address":"545 Bellevue Square",
      "openState":"Closed 鈰 Opens 7AM",
      "phone":"+1 425-452-5534",
      "website":"https://www.starbucks.com/store-locator/store/18615/",
      "description":"Iconic Seattle-based coffeehouse chain",
      "serviceOptions":"Dine-in   Takeaway   No delivery",
      "gpsCoordinates":{
         "latitude":"47.617077",
         "longitude":"-122.2019599"
      },
      "placeUrl":"https://www.google.com/maps/place/Starbucks/data=!4m7!3m6!1s0x54906c8f50e36025:0x5175a46aeadfbc0f!8m2!3d47.617077!4d-122.2019599!16s%2Fg%2F1thw6fd9!19sChIJJWDjUI9skFQRD7zf6mqkdVE?authuser=0&hl=en&rclk=1",
      "dataId":"0x54906c8f50e36025:0x5175a46aeadfbc0f"
   },
   {
      "title":"Starbucks",
      "reviews":"379",
      "type":"Coffee shop",
      "address":"1785 NE 44th St",
      "openState":"Closed 鈰 Opens 4:30AM",
      "phone":"+1 425-226-7007",
      "website":"https://www.starbucks.com/store-locator/store/10581/",
      "description":"Iconic Seattle-based coffeehouse chain",
      "serviceOptions":"Dine-in   Drive-through   Delivery",
      "gpsCoordinates":{
         "latitude":"47.5319688",
         "longitude":"-122.1942498"
      },
      "placeUrl":"https://www.google.com/maps/place/Starbucks/data=!4m7!3m6!1s0x549069a98254bd17:0xb2f64f75b3edf4c3!8m2!3d47.5319688!4d-122.1942498!16s%2Fg%2F1tdfmzpb!19sChIJF71UgqlpkFQRw_Tts3VP9rI?authuser=0&hl=en&rclk=1",
      "dataId":"0x549069a98254bd17:0xb2f64f75b3edf4c3"
   },
   ...and other results
]

Google Maps Local Results API

Alternatively, you can use the Google Maps Local Results API from SerpApi. SerpApi is a free API with 100 searches per month. If you need more searches, there are paid plans.

The difference is that you won't have to write code from scratch and maintain it. You may also experience blocking from Google and changing selectors which will break the parser. Instead, you just need to iterate the structured JSON and get the data you want. Check out the playground.

First, we need to install google-search-results-nodejs. To do this you need to enter in your console: npm i google-search-results-nodejs

馃搶Note: To make our search more relevant we need to add GPS coordinates parameter. It has to be constructed in the next sequence: @ + latitude + , + longitude + , + zoom. This will form a string that looks like this: e.g. @47.6040174,-122.1854488,11z. The zoom parameter is optional but recommended for higher precision (it ranges from 3z, map completely zoomed out - to 21z, map completely zoomed in). We have a dedicated video on our YouTube channel explaining What's and Why's about Google Maps GPS Coordinates.

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com

const searchString = "starbucks"; // what we want to search

const params = {
  engine: "google_maps", // search engine
  q: searchString, // search query
  hl: "en", // parameter defines the language to use for the Google search
  ll: "@47.6040174,-122.1854488,11z", // parameter defines GPS coordinates of location where you want your query to be applied
  type: "search", // parameter defines the type of search you want to make
};

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

const getResults = async () => {
  const allPlaces = [];
  while (true) {
    const json = await getJson();
    if (json.local_results) {
      allPlaces.push(...json.local_results)
    } else break;
    if (json.serpapi_pagination?.next) {
      !params.start ? (params.start = 20) : (params.start += 20);
    } else break;
  }
  return allPlaces;
};

getResults.then((result) => console.dir(result, { depth: null }));

Code explanation

Declare constants from required libraries:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);
Code Explanation
SerpApi SerpApi Node.js library
search new instance of GoogleSearch class
API_KEY your API key from SerpApi

Next, we write down what we want to search and the necessary parameters for making a request:

const searchString = "starbucks";

const params = {
  engine: "google_maps",
  q: searchString,
  hl: "en",
  ll: "@47.6040174,-122.1854488,11z", // parameter defines GPS coordinates of location where you want your query to be applied
  type: "search", // parameter defines the type of search you want to make
};
Code Explanation
searchString what we want to search
engine search engine
q search query
hl parameter defines the language to use for the Google Scholar search
ll parameter defines GPS coordinates of location where you want your query to be applied
type parameter defines the type of search you want to make

Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  })
}

And finally, we declare and run the function getResult that gets places info from all pages and return it:

const getResults = async () => {
  const allPlaces = [];
  while (true) {
    const json = await getJson();
    if (json.local_results) {
      allPlaces.push(...json.local_results)
    } else break;
    if (json.serpapi_pagination?.next) {
      !params.start ? (params.start = 20) : (params.start += 20);
    } else break;
  }
  return allPlaces;
};

getResults().then((result) => console.dir(result, { depth: null }))
Code Explanation
allPlaces an array with all citations info from all pages
allPlaces.push(...json.local_results) in this code, we use spread syntax to split the local_results array from result that was returned from getJson function into elements and add them in the end of allPlaces array
console.dir(result, { depth: null }) console method dir allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info

Output

[
   {
      "position":1,
      "title":"Starbucks",
      "place_id":"ChIJrxaZdhlBkFQRk-hWRsy4sWA",
      "data_id":"0x54904119769916af:0x60b1b8cc4656e893",
      "data_cid":"6967553286011807891",
      "reviews_link":"https://serpapi.com/search.json?data_id=0x54904119769916af%3A0x60b1b8cc4656e893&engine=google_maps_reviews&hl=en",
      "photos_link":"https://serpapi.com/search.json?data_id=0x54904119769916af%3A0x60b1b8cc4656e893&engine=google_maps_photos&hl=en",
      "gps_coordinates":{
         "latitude":47.544705,
         "longitude":-122.38743199999999
      },
      "place_id_search":"https://serpapi.com/search.json?data=%214m5%213m4%211s0x54904119769916af%3A0x60b1b8cc4656e893%218m2%213d47.544705%214d-122.38743199999999&engine=google_maps&google_domain=google.com&hl=en&start=80&type=place",
      "rating":4.2,
      "reviews":310,
      "price":"$$",
      "type":"Coffee shop",
      "address":"6501 California Ave SW, Seattle, WA 98136, United States",
      "open_state":"Closed 鈰 Opens 5AM",
      "hours":"Closed 鈰 Opens 5AM",
      "operating_hours":{
         "wednesday":"5am鈥5:30pm",
         "thursday":"5am鈥5:30pm",
         "friday":"5am鈥5:30pm",
         "saturday":"5am鈥5:30pm",
         "sunday":"5am鈥5:30pm",
         "monday":"5am鈥5:30pm",
         "tuesday":"5am鈥5:30pm"
      },
      "phone":"+1 206-938-6371",
      "website":"https://www.starbucks.com/store-locator/store/18390/",
      "description":"Iconic Seattle-based coffeehouse chain. Seattle-based coffeehouse chain known for its signature roasts, light bites and WiFi availability.",
      "service_options":{
         "dine_in":true,
         "drive_through":true,
         "delivery":true
      },
      "thumbnail":"https://lh5.googleusercontent.com/p/AF1QipOSvSFJ7cD_s3pemaRs_TjEQe2_aVAy_NhUZVgN=w80-h106-k-no"
   },
   ...and other results
]

If you want to see some projects made with SerpApi, please write me a message.


Join us on Twitter | YouTube

Add a Feature Request馃挮 or a Bug馃悶