What will be scraped

what

Full code

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const searchString = "star wars"; // what we want to search

// Parameters used to build the search URL and the absolute links in the results.
const requestParams = {
  baseURL: "https://www.youtube.com",
  // encodeURIComponent (unlike encodeURI) also escapes "&", "=", "+", "#" and "?",
  // so a query containing those characters cannot break the search_query parameter
  encodedQuery: encodeURIComponent(searchString), // what we want to search for, encoded as a URL parameter value
};

/**
 * Scrolls every element matching the selector into view, one by one, and keeps going
 * until a pass finishes without new matching elements having appeared on the page.
 *
 * @param {object} page - Puppeteer page to scroll
 * @param {string} scrollElements - CSS selector of the elements to scroll through
 * @returns {Promise<void>}
 */
async function scrollPage(page, scrollElements) {
  // counts the matching elements that are currently in the document
  const countElements = () =>
    page.evaluate((scrollElements) => document.querySelectorAll(scrollElements).length, scrollElements);

  let currentElement = 0;
  let elementsLength;
  let newElementsLength;
  do {
    elementsLength = await countElements();
    // continue from the first element that has not been scrolled to yet
    while (currentElement < elementsLength) {
      await page.waitForTimeout(200);
      await page.evaluate(
        (currentElement, scrollElements) => {
          document.querySelectorAll(scrollElements)[currentElement].scrollIntoView();
        },
        currentElement,
        scrollElements
      );
      currentElement += 1;
    }
    // wait before counting again, then stop once the count no longer changes
    await page.waitForTimeout(5000);
    newElementsLength = await countElements();
  } while (newElementsLength !== elementsLength);
}

/**
 * Collects data about every video result that is currently rendered on the search page.
 *
 * @param {object} page - Puppeteer page with the YouTube search results loaded
 * @returns {Promise<Array<object>>} one object per video; a field is undefined when its element is absent
 */
async function fillDataFromPage(page) {
  const dataFromPage = await page.evaluate((requestParams) => {
    // page.evaluate() runs this callback in the page, so the helper has to be declared inside it.
    // A missing element or href gives undefined instead of a bogus "<baseURL>undefined" link.
    const toAbsoluteLink = (href) => (href ? `${requestParams.baseURL}${href}` : undefined);

    return Array.from(document.querySelectorAll("#contents > ytd-video-renderer")).map((el) => {
      // queried once: index 0 is read as the views, index 1 as the published date
      const metadata = el.querySelectorAll("#metadata-line > span");
      return {
        title: el.querySelector("a#video-title")?.textContent.trim(),
        link: toAbsoluteLink(el.querySelector("a#thumbnail")?.getAttribute("href")),
        channel: {
          name: el.querySelector("#channel-info #channel-name a")?.textContent.trim(),
          link: toAbsoluteLink(el.querySelector("#channel-info > a")?.getAttribute("href")),
          thumbnail: el.querySelector("#channel-info > a #img")?.getAttribute("src"),
        },
        publishedDate: metadata[1]?.textContent.trim(),
        views: metadata[0]?.textContent.trim(),
        length: el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")?.textContent.trim(),
        description: el.querySelector(".metadata-snippet-container > yt-formatted-string")?.textContent.trim(),
        extensions: Array.from(el.querySelectorAll("#badges .badge")).map((badge) => badge.querySelector("span")?.textContent.trim()),
        thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
      };
    });
  }, requestParams);
  return dataFromPage;
}

/**
 * Launches a browser, opens the YouTube search results page for the configured query,
 * scrolls through the results and returns the data collected by fillDataFromPage().
 *
 * @returns {Promise<Array<object>>} the scraped video results
 */
async function getYoutubeOrganicResults() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  // try/finally guarantees that the browser is closed even when one of the steps below throws
  try {
    const page = await browser.newPage();

    const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;

    await page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);

    const scrollElements = "#contents > ytd-video-renderer";

    // wait until the first results are rendered before starting to scroll
    await page.waitForSelector(scrollElements);

    await scrollPage(page, scrollElements);

    await page.waitForTimeout(10000);

    const organicResults = await fillDataFromPage(page);

    return organicResults;
  } finally {
    await browser.close();
  }
}

// print the results; report a failed run instead of leaving the promise rejection unhandled
getYoutubeOrganicResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

Preparation

First, we need to create a Node.js* project and add npm packages puppeteer, puppeteer-extra and puppeteer-extra-plugin-stealth to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.

To do this, in the directory with our project, open the command line and enter npm init -y, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth.

*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.

📌Note: you can also use puppeteer without any extensions, but I strongly recommend using it with puppeteer-extra and puppeteer-extra-plugin-stealth to prevent websites from detecting that you are using headless Chromium or a web driver. You can check it on the Chrome headless tests website. The screenshot below shows the difference.

stealth

Process

SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have any struggles understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.

The Gif below illustrates the approach of selecting different parts of the results.

how

Code explanation

Declare constants from required libraries:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
Code Explanation
puppeteer Chromium control library
StealthPlugin library that prevents websites from detecting that you are using a web driver

Next, we tell puppeteer to use StealthPlugin and write what we want to search:

// register the stealth plugin on puppeteer before any browser is launched
puppeteer.use(StealthPlugin());

// what we want to search
const searchString = "star wars";

Next, we write down a function for scrolling page:

/**
 * Scrolls every element matching the selector into view, one by one, and keeps going
 * until a pass finishes without new matching elements having appeared on the page.
 *
 * @param {object} page - Puppeteer page to scroll
 * @param {string} scrollElements - CSS selector of the elements to scroll through
 * @returns {Promise<void>}
 */
async function scrollPage(page, scrollElements) {
  // counts the matching elements that are currently in the document
  const countElements = () =>
    page.evaluate((scrollElements) => document.querySelectorAll(scrollElements).length, scrollElements);

  let currentElement = 0;
  let elementsLength;
  let newElementsLength;
  do {
    elementsLength = await countElements();
    // continue from the first element that has not been scrolled to yet
    while (currentElement < elementsLength) {
      await page.waitForTimeout(200);
      await page.evaluate(
        (currentElement, scrollElements) => {
          document.querySelectorAll(scrollElements)[currentElement].scrollIntoView();
        },
        currentElement,
        scrollElements
      );
      currentElement += 1;
    }
    // wait before counting again, then stop once the count no longer changes
    await page.waitForTimeout(5000);
    newElementsLength = await countElements();
  } while (newElementsLength !== elementsLength);
}
Code Explanation
elementsLength amount of elements on the page before scrolling
page.evaluate() runs code from the brackets in the browser console and returns the result
document.querySelectorAll(scrollElements) returns a static NodeList representing a list of the document's elements that match the CSS selector passed in the scrollElements argument
page.waitForTimeout(200) waits 200 ms before continuing
newElementsLength amount of elements on the page after scrolling

Next, we write down a function for getting organic results data from the search page:

/**
 * Collects data about every video result that is currently rendered on the search page.
 *
 * @param {object} page - Puppeteer page with the YouTube search results loaded
 * @returns {Promise<Array<object>>} one object per video; a field is undefined when its element is absent
 */
async function fillDataFromPage(page) {
  const dataFromPage = await page.evaluate((requestParams) => {
    // page.evaluate() runs this callback in the page, so the helper has to be declared inside it.
    // A missing element or href gives undefined instead of a bogus "<baseURL>undefined" link.
    const toAbsoluteLink = (href) => (href ? `${requestParams.baseURL}${href}` : undefined);

    return Array.from(document.querySelectorAll("#contents > ytd-video-renderer")).map((el) => {
      // queried once: index 0 is read as the views, index 1 as the published date
      const metadata = el.querySelectorAll("#metadata-line > span");
      return {
        title: el.querySelector("a#video-title")?.textContent.trim(),
        link: toAbsoluteLink(el.querySelector("a#thumbnail")?.getAttribute("href")),
        channel: {
          name: el.querySelector("#channel-info #channel-name a")?.textContent.trim(),
          link: toAbsoluteLink(el.querySelector("#channel-info > a")?.getAttribute("href")),
          thumbnail: el.querySelector("#channel-info > a #img")?.getAttribute("src"),
        },
        publishedDate: metadata[1]?.textContent.trim(),
        views: metadata[0]?.textContent.trim(),
        length: el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")?.textContent.trim(),
        description: el.querySelector(".metadata-snippet-container > yt-formatted-string")?.textContent.trim(),
        extensions: Array.from(el.querySelectorAll("#badges .badge")).map((badge) => badge.querySelector("span")?.textContent.trim()),
        thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
      };
    });
  }, requestParams);
  return dataFromPage;
}
Code Explanation
Array.from() this method creates a new, shallow-copied Array instance from an iterable or array-like object
el.querySelector("a#video-title") returns the first html element with selector a#video-title which is any child of the el html element
.textContent gets the raw text of html element
.trim() removes whitespace from both ends of a string
.getAttribute("href") gets the href attribute value of the html element

And finally, a function to control the browser, and get information:

/**
 * Launches a browser, opens the YouTube search results page for the configured query,
 * scrolls through the results and returns the data collected by fillDataFromPage().
 *
 * @returns {Promise<Array<object>>} the scraped video results
 */
async function getYoutubeOrganicResults() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  // try/finally guarantees that the browser is closed even when one of the steps below throws
  try {
    const page = await browser.newPage();

    const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;

    await page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);

    const scrollElements = "#contents > ytd-video-renderer";

    // wait until the first results are rendered before starting to scroll
    await page.waitForSelector(scrollElements);

    await scrollPage(page, scrollElements);

    await page.waitForTimeout(10000);

    const organicResults = await fillDataFromPage(page);

    return organicResults;
  } finally {
    await browser.close();
  }
}

// print the results; report a failed run instead of leaving the promise rejection unhandled
getYoutubeOrganicResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
Code Explanation
puppeteer.launch({options}) this method launches a new instance of the Chromium browser with current options
headless defines which mode to use: headless (by default) or non-headless
args an array with arguments which is used with Chromium
["--no-sandbox", "--disable-setuid-sandbox"] these arguments we use to allow the launch of the browser process in the online IDE
browser.newPage() this method launches a new page
page.setDefaultNavigationTimeout(60000) changes the default (30 sec) navigation timeout to 60000 ms (1 min) for slow internet connections
page.goto(URL) navigation to URL which is defined above
browser.close() after all we close the browser instance

Now we can launch our parser. To do this enter node YOUR_FILE_NAME in your command line. Where YOUR_FILE_NAME is the name of your .js file.

Output

[
  {
    "title": "Star Wars Battlefront 2 - Funny Moments Order #66",
    "link": "https://www.youtube.com/watch?v=LquShRk_3sw",
    "channel": {
      "name": "Jongo Phett",
      "link": "https://www.youtube.com/c/JongoPhett",
      "thumbnail": "https://yt3.ggpht.com/ytc/AKedOLR-k_Ubr0aJgzNu91jAQCc-vnCOpyIkASWxIbm7rQ=s68-c-k-c0x00ffffff-no-rj"
    },
    "publishedDate": "16 hours ago",
    "views": "12K views",
    "length": "10:39",
    "description": "episode 66 of Star Wars Battlefront 2 Funny Moments, a montage of the funniest star wars clips in battlefront II. edited together by ...",
    "extensions": ["New"],
    "thumbnail": "https://i.ytimg.com/vi/LquShRk_3sw/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAFNjI-rEeq5mmDL6I4nXgxZyId3Q"
  },
  {
    "title": "The Most Powerful Character In All of Star Wars",
    "link": "https://www.youtube.com/watch?v=JTTv8mmxoTE",
    "channel": {
      "name": "The Stupendous Wave",
      "link": "https://www.youtube.com/c/TheStupendousWave",
      "thumbnail": "https://yt3.ggpht.com/ytc/AKedOLQ0T0u6VqryQ-Z5efb1qVTcUHthiH8EamJMKDAE=s68-c-k-c0x00ffffff-no-rj"
    },
    "publishedDate": "10 hours ago",
    "views": "29K views",
    "length": "12:22",
    "description": "For all sponsorship and business inquiries please contact: thestupendousscrub@gmail.com Business: ...",
    "extensions": ["New"],
    "thumbnail": "https://i.ytimg.com/vi/JTTv8mmxoTE/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBdHe_wWjGruLfxz1acr-3jP0YltA"
  }
  ...and other results
]

YouTube Video Results API

Alternatively, you can use the YouTube Video Results API from SerpApi.

The difference is that you get the same results without browser automation, which saves time. You also don't need to write a parser from scratch or pick and maintain the right CSS selectors, which can change over time. Finally, with your own scraper there is always a possibility that at some point the request will be blocked as suspicious. Instead, you just need to iterate over the structured JSON and get the data you want.

First, we need to install google-search-results-nodejs. To do this you need to enter in your console: npm i google-search-results-nodejs

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY);           //your API key from serpapi.com

const searchString = "star wars";                                       // what we want to search

// request parameters; getResults() later adds an "sp" page token to this same object
const params = {
  engine: "youtube",                                                    // search engine
  search_query: searchString,                                           // search query
};

// Promise adapter around the callback-style search.json(), so the response can be awaited
const getJson = () =>
  new Promise((resolve) => {
    search.json(params, (json) => resolve(json));
  });

/**
 * Requests result pages one after another and gathers the videos from all of them.
 * The next page token is written to params.sp before the following request.
 *
 * @returns {Promise<Array<object>>} videos from every page, in the order they were received
 */
const getResults = async () => {
  const allVideos = [];
  let hasNextPage = true;
  while (hasNextPage) {
    const json = await getJson();
    // a response without video_results ends the collection
    if (!json.video_results) {
      return allVideos;
    }
    allVideos.push(...json.video_results);
    const nextPageToken = json.serpapi_pagination?.next_page_token;
    hasNextPage = Boolean(nextPageToken);
    if (hasNextPage) {
      params.sp = nextPageToken;
    }
  }
  return allVideos;
};

// print the results; report a failed run instead of leaving the promise rejection unhandled
getResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

Code explanation

Declare constants from required libraries:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);
Code Explanation
SerpApi SerpApi Node.js library
search new instance of GoogleSearch class
API_KEY your API key from SerpApi

Next, we write down what we want to search and the necessary parameters for making a request:

// what we want to search
const searchString = "star wars";

// request parameters; getResults() later adds an "sp" page token to this same object
const params = {
  engine: "youtube",
  search_query: searchString,
};
Code Explanation
searchString what we want to search
engine search engine
search_query search query

Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:

// Promise adapter around the callback-style search.json(), so the response can be awaited
const getJson = () =>
  new Promise((resolve) => {
    search.json(params, (json) => resolve(json));
  });

And finally, we declare and run the function getResults that gets video info from all pages and returns it:

/**
 * Requests result pages one after another and gathers the videos from all of them.
 * The next page token is written to params.sp before the following request.
 *
 * @returns {Promise<Array<object>>} videos from every page, in the order they were received
 */
const getResults = async () => {
  const allVideos = [];
  let hasNextPage = true;
  while (hasNextPage) {
    const json = await getJson();
    // a response without video_results ends the collection
    if (!json.video_results) {
      return allVideos;
    }
    allVideos.push(...json.video_results);
    const nextPageToken = json.serpapi_pagination?.next_page_token;
    hasNextPage = Boolean(nextPageToken);
    if (hasNextPage) {
      params.sp = nextPageToken;
    }
  }
  return allVideos;
};

// print the results; report a failed run instead of leaving the promise rejection unhandled
getResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
Code Explanation
allVideos an array with videos info from all pages
allVideos.push(...json.video_results) in this code, we use spread syntax to split the video_results array from the result returned by the getJson function into individual elements and add them to the end of the allVideos array

Output

[
  {
    "position_on_page": 1,
    "title": "LEGO Star Wars Sets I would DIE FOR! (Part 6)",
    "link": "https://www.youtube.com/watch?v=CGAYy_lqyHk",
    "channel": {
      "name": "LEGO Empire",
      "link": "https://www.youtube.com/c/LEGOEmpireOfficial",
      "thumbnail": "https://yt3.ggpht.com/wJNsooqsZmzGudM2Z0jJ4umj6I9aeHdUfBI8J3d4uC98qKbU5UT6isxzQ-yw5bQ9rBBSo_SY0Eg=s68-c-k-c0x00ffffff-no-rj"
    },
    "published_date": "2 days ago",
    "views": 10024,
    "length": "5:40",
    "description": "LEGO Star Wars Sets I would DIE FOR, Part 6, is HERE! These sets are 18+ LEGO star wars fans dreams come true! Credits to ...",
    "extensions": ["New"],
    "thumbnail": {
      "static": "https://i.ytimg.com/vi/CGAYy_lqyHk/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAudls7QsD4J6_dHDclXgm9Z0lyVg"
    }
  },
  {
    "position_on_page": 2,
    "title": "LEGO Star Wars 75335 BD-1 Review! (2022)",
    "link": "https://www.youtube.com/watch?v=Gega7LfS658",
    "channel": {
      "name": "MandRproductions",
      "link": "https://www.youtube.com/c/MandRproductions",
      "verified": true,
      "thumbnail": "https://yt3.ggpht.com/Jrnvzkoi6Hga08KaZ5Z1N99lmBMFWDMtEGgCHGV6O-m3IYdlJN2gKZV68OVBYVQM2YT8Fq3L=s68-c-k-c0x00ffffff-no-rj"
    },
    "published_date": "3 days ago",
    "views": 45876,
    "length": "7:38",
    "description": "LEGO Star Wars Jedi: Fallen Order sets were thought to be impossible. Enter the 75335 BD-1 Buildable Character Summer 2022 ...",
    "extensions": ["New"],
    "thumbnail": {
      "static": "https://i.ytimg.com/vi/Gega7LfS658/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCX1tTbkRxhZvSw-O5i5Prsp4x8Hw",
      "rich": "https://i.ytimg.com/an_webp/Gega7LfS658/mqdefault_6s.webp?du=3000&sqp=CM6FhJcG&rs=AOn4CLCGISNuZXlgmZVlZmnlEIAZmGE8jA"
    }
  },
   ...and other results
]

If you want to see some projects made with SerpApi, please write me a message.


Join us on Twitter | YouTube

Add a Feature Request💫 or a Bug🐞