What will be scraped
Using YouTube Search Engine Results API
The difference is that you don't need to use browser automation to scrape results, and write the parser from scratch and maintain it, which saves a lot of time.
There's also a chance that the request might be blocked at some point from Google. Instead, you just need to iterate the structured JSON and get the data you want.
First, we need to install google-search-results-nodejs
. To do this you need to enter in your console: npm i google-search-results-nodejs
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
const searchString = "star wars"; // what we want to search (for movie results)
// const searchString = "Red Hot Chili Peppers"; // what we want to search (for related search results)
// const searchString = "java course"; // what we want to search (for other results)
const params = {
engine: "youtube", // search engine
search_query: searchString, // search query
};
const getJson = () => {
return new Promise((resolve) => {
search.json(params, resolve);
});
};
const getResults = async () => {
const moviesResults = [];
const relatedSearch = [];
const playlists = [];
const channels = [];
const categories = {};
const ads = [];
while (true) {
let categoriesKeys;
const json = await getJson();
if (!json.video_results) {
break;
} else {
categoriesKeys = Object.keys(json).filter((el) => {
if (
el !== "search_metadata" &&
el !== "search_parameters" &&
el !== "search_information" &&
el !== "movie_results" &&
el !== "channel_results" &&
el !== "video_results" &&
el !== "playlist_results" &&
el !== "ads_results" &&
el !== "people_also_search_for" &&
el !== "pagination" &&
el !== "serpapi_pagination"
) {
return true;
} else return false;
});
}
if (json.movie_results) {
moviesResults.push(...json.movie_results);
}
if (json.people_also_search_for) {
relatedSearch.push(...json.people_also_search_for.searches);
}
if (json.playlist_results) {
playlists.push(...json.playlist_results);
}
if (json.channel_results) {
channels.push(...json.channel_results);
}
if (categoriesKeys.length !== 0) {
for (key of categoriesKeys) {
categories[key] = json[key];
}
}
if (json.ads_results) {
ads.push(...json.ads_results);
}
if (json.serpapi_pagination?.next_page_token) {
params.sp = json.serpapi_pagination?.next_page_token;
} else break;
}
return { moviesResults, relatedSearch, playlists, channels, categories, ads };
};
getResults().then((result) => console.dir(result, { depth: null }));
Code explanation
Declare constants from required libraries:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);
Code | Explanation |
---|---|
SerpApi |
SerpApi Node.js library |
search |
new instance of GoogleSearch class |
API_KEY |
your API key from SerpApi |
Next, we write down what we want to search and the necessary parameters for making a request:
const searchString = "star wars"; // what we want to search (for movie results)
// const searchString = "Red Hot Chili Peppers"; // what we want to search (for related search results)
// const searchString = "java course"; // what we want to search (for other results)
const params = {
engine: "youtube",
search_query: searchString,
};
❗ Not all search results being viewed are available on the page at the same time, so I use several different searchString
to show all search results.
Code | Explanation |
---|---|
searchString |
what we want to search |
engine |
search engine |
search_query |
search query |
Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:
const getJson = () => {
return new Promise((resolve) => {
search.json(params, resolve);
})
}
And finally, we declare and run the function getResult
that gets info from all pages and return it:
const getResults = async () => {
const moviesResults = [];
const relatedSearch = [];
const playlists = [];
const channels = [];
const categories = {};
const ads = [];
while (true) {
let categoriesKeys;
const json = await getJson();
if (!json.video_results) {
break;
} else {
categoriesKeys = Object.keys(json).filter((el) => {
if (
el !== "search_metadata" &&
el !== "search_parameters" &&
el !== "search_information" &&
el !== "movie_results" &&
el !== "channel_results" &&
el !== "video_results" &&
el !== "playlist_results" &&
el !== "ads_results" &&
el !== "people_also_search_for" &&
el !== "pagination" &&
el !== "serpapi_pagination"
) {
return true;
} else return false;
});
}
if (json.movie_results) {
moviesResults.push(...json.movie_results);
}
if (json.people_also_search_for) {
relatedSearch.push(...json.people_also_search_for.searches);
}
if (json.playlist_results) {
playlists.push(...json.playlist_results);
}
if (json.channel_results) {
channels.push(...json.channel_results);
}
if (categoriesKeys.length !== 0) {
for (key of categoriesKeys) {
categories[key] = json[key];
}
}
if (json.ads_results) {
ads.push(...json.ads_results);
}
if (json.serpapi_pagination?.next_page_token) {
params.sp = json.serpapi_pagination?.next_page_token;
} else break;
}
return { moviesResults, relatedSearch, playlists, channels, categories, ads };
};
getResults().then((result) => console.dir(result, { depth: null }));
Code | Explanation |
---|---|
moviesResults, relatedSearch, playlists, channels, categories, ads |
arrays and object with info from all pages |
Object.keys(json).filter((el) => {... |
in this code, we get all keys from json with results, and we filter them to get only keys with category names. We need to do this because categories names are different in different searches |
moviesResults.push(...json.movie_results) |
in this line, we use spread syntax to split the movie_results array from result that was returned from getJson function into elements and add them in the end of moviesResults array |
console.dir(result, { depth: null }) |
console method dir allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info |
Output
📌Note: I've combined the results of different runs of our parser into one for convenience.
{
"moviesResults": [
{
"position_on_page": 1,
"title": "Star Wars: A New Hope",
"link": "https://www.youtube.com/watch?v=yYNSSNJ0z_U",
"channel": {
"name": "YouTube Movies",
"link": "https://www.youtube.com/channel/UClgRkhTL3_hImCAmdLfDE4g",
"verified": true
},
"length": "2:04:44",
"description": "Luke Skywalker begins a journey that will change the galaxy in Star Wars: Episode IV - A New Hope. Nineteen years after the ...",
"info": ["Action & adventure • 1977 • PG • English audio", "Actors: Mark Hamill, Harrison Ford, Carrie Fisher", "Director: George Lucas"],
"extensions": ["4K", "CC"],
"thumbnail": "https://i.ytimg.com/vi_webp/yYNSSNJ0z_U/movieposter.webp"
},
...and other movies search results
],
"relatedSearch": [
{
"query": "Flea",
"link": "https://www.youtube.com/results?search_query=flea+red+hot+chili+peppers&sp=EiuSASgKCi9tLzAxd3d2dDIqGmZsZWEgcmVkIGhvdCBjaGlsaSBwZXBwZXJzeAE%253D",
"thumbnail": "https://lh3.googleusercontent.com/d-6lF1_ytc-rzGeaWkZ5QVPLHhbWSAGIopb9ZnEsGC3119ZOOiBgvIkNHUZ-T2ca5N3CtJiX6NBG6OlY=w320-h180-p-k-c0x00ffffff-no-rj-mo"
},
{
"query": "Foo Fighters",
"link": "https://www.youtube.com/results?search_query=foo+fighters&sp=EhySARkKCS9tLzAycjN6eSoMZm9vIGZpZ2h0ZXJzeAE%253D",
"thumbnail": "https://lh3.googleusercontent.com/AjZ4PKnTltansmzMgehO4gbugmYdST2SO01i254dNq12it5vqaNW2OuYyxuLVWa6m6-zrPEI1oKqyA6k=w320-h180-p-k-c0x00ffffff-no-rj-mo"
},
...and other related search results
],
"playlists": [
{
"position_on_page": 5,
"title": "Java Complete Course | Placement Series",
"link": "https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"channel": {
"name": "Apna College",
"link": "https://www.youtube.com/c/ApnaCollegeOfficial",
"verified": true
},
"video_count": 34,
"videos": [
{
"title": "Introduction to Java Language | Lecture 1 | Complete Placement Course",
"link": "https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length": "18:46"
},
{
"title": "Variables in Java | Input Output | Complete Placement Course | Lecture 2",
"link": "https://www.youtube.com/watch?v=LusTv0RlnSU&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length": "42:36"
}
],
"thumbnail": "https://i.ytimg.com/vi/yRpLlJmRo2w/hqdefault.jpg?sqp=-oaymwEWCKgBEF5IWvKriqkDCQgBFQAAiEIYAQ==&rs=AOn4CLD-eck8J8t2xvDyAHNFfsPo5jCFEQ"
},
...and other playlists results
],
"channels": [
{
"position_on_page": 2,
"title": "Star Wars",
"link": "https://www.youtube.com/c/StarWars",
"verified": true,
"subscribers": 3870000,
"video_count": 1596,
"description": "Welcome to the official Star Wars YouTube channel -- home to a galaxy of Star Wars videos including trailers, behind-the-scenes ...",
"thumbnail": "https://yt3.ggpht.com/NLJIsq7K-Qr7AMpHkLstcm9F_ZQzel_CYngyfJvAuBoOzyICVBlpXZzmGlMFqhD1PoV1bJwoxyk=s88-c-k-c0x00ffffff-no-rj-mo"
}
],
"categories": {
"new_for_you": [
{
"position_on_page": 10,
"title": "Java Classes - How To Use Classes in Java",
"link": "https://www.youtube.com/watch?v=vjjjGkXpX_I",
"channel": {
"name": "Alex Lee",
"link": "https://www.youtube.com/c/AlexLeeYT",
"verified": true,
"thumbnail": "https://yt3.ggpht.com/ytc/AKedOLRNFLCMjPktRxac875zHMNjIa4tGp4Dw7AYWTFsJA=s68-c-k-c0x00ffffff-no-rj"
},
"published_date": "2 years ago",
"views": 205431,
"length": "7:20",
"description": "Full Java Course: https://course.alexlorenlee.com/courses/learn-java-fast Get my favorite programming audiobook for free!",
"thumbnail": {
"static": "https://i.ytimg.com/vi/vjjjGkXpX_I/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLDyXVT_EF4J8-9k9svyk64VcOOXXw",
"rich": "https://i.ytimg.com/an_webp/vjjjGkXpX_I/mqdefault_6s.webp?du=3000&sqp=CMaxjpcG&rs=AOn4CLBCDg4Np1TGXpIoMPPosu95Qr5pWg"
}
},
...and other "New for you" category results
],
"people_also_watched": [
{
"position_on_page": 15,
"title": "The Rise and Fall of Java",
"link": "https://www.youtube.com/watch?v=HEdPX8pt_DQ",
"channel": {
"name": "The Science Elf",
"link": "https://www.youtube.com/c/TheScienceElf",
"verified": true,
"thumbnail": "https://yt3.ggpht.com/ytc/AKedOLQn8fCsUjPPL0lbNAFJe_HXgA1EUaKyLoGGPv3ZLg=s68-c-k-c0x00ffffff-no-rj"
},
"published_date": "4 years ago",
"views": 976253,
"length": "10:38",
"description": "Few people know that Java, the worlds most popular programming language, the one that powers smartphone apps and Mars ...",
"thumbnail": {
"static": "https://i.ytimg.com/vi/HEdPX8pt_DQ/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBHtNBTE9it3-RV2VVOK7-8GdqneA",
"rich": "https://i.ytimg.com/an_webp/HEdPX8pt_DQ/mqdefault_6s.webp?du=3000&sqp=CLiZjpcG&rs=AOn4CLDDJrB5EzOmxBvn-cyyrKe2Z7FD3Q"
}
},
...and other "People also watched" category results
]
},
"ads": [
{
"position_on_page": 1,
"title": "Master's Software Development - Boston University MET",
"link": "https://www.google.com/aclk?sa=l&ai=CrTN7ZqPjYpXXNc_cvdMP3NmoCIjPlPppzYm39KkH5eSqMAgAEAEgu5u6TigCYMkGoAHD-uSfA8gBAaoEYU_Q5EsHdk_452XZaC1_fyB79Co1nIBB_lDRdIQ6e7WBrEQHcxEKyh1LGhnkkMfGDA7Gwr5k8GJ2zNeR47inW0f9xxdyzMyhRRezTWheb5TINlG7AtKmlI6zBRbyYwBT6hzABLSM-YfyAYgF7uvbswWgBlGAB6WFm2CQBwGoB9TSG6gHpr4bqAe5mrECqAfz0RuoB-7SG6gH_5yxAqgHytwbqAfYprECoAiYFrAIBdIIDhABIKSBAjIDhsACOgEAsQmpa43r1mel7bkJqRTXltMD4pH4CQGYCwG4DAHoDASCFBsIAhIXamF2YSBwcm9ncmFtbWluZyBjb3Vyc2WIFAHQFQGYFgH4FgGAFwGSFwkSBwgBEAMYmAM&sig=AOD64_3UNCMoEWXo-pEvlsU7AL4hYhyTHA&adurl=https://bumetprograms.bu.edu/software-development/%3Futm_source%3Dgoogle%26utm_medium%3Dcpc%26utm_campaign%3DJava%26utm_content%3DJava%26utm_term%3DJava%2520programming%2520course&ms=[CLICK_MS]&nx=[NX]&ny=[NY]&nb=[NB]",
"displayed_link": "https://bumetprograms.bu.edu/ms_software/development",
"description": "Enhance Your Java Programming Skills with a Master's in Software Development. PhD-Level Faculty. On-Campus or Online. NEASC Accredited.",
"sitelinks": {
"inline": [
{
"title": "Download a Brochure",
"link": "https://www.google.com/aclk?sa=l&ai=C6rADZqPjYpXXNc_cvdMP3NmoCIjPlPppzYm39KkH5eSqMAgAEAEgu5u6TigCYMkGoAHD-uSfA8gBAaoEYU_Q5EsHdk_452XZaC1_fyB79Co1nIBB_lDRdIQ6e7WBrEQHcxEKyh1LGhnkkMfGDA7Gwr5k8GJ2zNeR47inW0f9xxdyzMyhRRezTWheb5TINlG7AtKmlI6zBRbyYwBT6hzABLSM-YfyAYgF7uvbswXABQSgBlGAB6WFm2CQBwGoB9TSG6gHpr4bqAe5mrECqAfz0RuoB-7SG6gH_5yxAqgHytwbqAfYprECoAiYFrAIBcAIAdIIDhABIKSBAjIDhsACOgEAsQmiyQGKGnFB97kJtX0PNynd_5X4CQGYCwHaCwoIHxDX1ufVfhgAuAwB6AwEghQbCAISF2phdmEgcHJvZ3JhbW1pbmcgY291cnNliBQB0BUBmBYB-BYBgBcBkhcJEgcIARADGJgD&sig=AOD64_3deQdic_tvs5A8ofieMdxoK4iI5w&adurl=https://bumetprograms.bu.edu/software-development/%3Futm_source%3Dgoogle%26utm_medium%3Dcpc%26utm_campaign%3D1450636782%26utm_content%3Dsitelink%26utm_term%3DJava%2520programming%2520course&ms=[CLICK_MS]&nx=[NX]&ny=[NY]&nb=[NB]"
},
{
"title": "Online MSSD",
"link": "https://www.google.com/aclk?sa=l&ai=CeQnIZqPjYpXXNc_cvdMP3NmoCIjPlPppzYm39KkH5eSqMAgAEAEgu5u6TigCYMkGoAHD-uSfA8gBAaoEYU_Q5EsHdk_452XZaC1_fyB79Co1nIBB_lDRdIQ6e7WBrEQHcxEKyh1LGhnkkMfGDA7Gwr5k8GJ2zNeR47inW0f9xxdyzMyhRRezTWheb5TINlG7AtKmlI6zBRbyYwBT6hzABLSM-YfyAYgF7uvbswXABQSgBlGAB6WFm2CQBwGoB9TSG6gHpr4bqAe5mrECqAfz0RuoB-7SG6gH_5yxAqgHytwbqAfYprECoAiYFrAIBcAIAtIIDhABIKSBAjIDhsACOgEAsQmvWBWsQIUwkrkJVXFxT_HlaN_4CQGYCwHaCwoIHxDU1ufVfhgAuAwB6AwEghQbCAISF2phdmEgcHJvZ3JhbW1pbmcgY291cnNliBQB0BUBmBYB-BYBgBcBkhcJEgcIARADGJgD&sig=AOD64_1Rt7P1x70ClzRvMPSv0CBiz5yvcQ&adurl=https://www.bu.edu/online/programs/graduate-programs/software-development/%3Futm_source%3Dgoogle%26utm_medium%3Dcpc%26utm_campaign%3D1450636782%26utm_content%3Dsitelink%26utm_term%3DJava%2520programming%2520course&ms=[CLICK_MS]&nx=[NX]&ny=[NY]&nb=[NB]"
},
{
"title": "Dept. of Computer Science",
"link": "https://www.google.com/aclk?sa=l&ai=ChV1FZqPjYpXXNc_cvdMP3NmoCIjPlPppzYm39KkH5eSqMAgAEAEgu5u6TigCYMkGoAHD-uSfA8gBAaoEYU_Q5EsHdk_452XZaC1_fyB79Co1nIBB_lDRdIQ6e7WBrEQHcxEKyh1LGhnkkMfGDA7Gwr5k8GJ2zNeR47inW0f9xxdyzMyhRRezTWheb5TINlG7AtKmlI6zBRbyYwBT6hzABLSM-YfyAYgF7uvbswXABQSgBlGAB6WFm2CQBwGoB9TSG6gHpr4bqAe5mrECqAfz0RuoB-7SG6gH_5yxAqgHytwbqAfYprECoAiYFrAIBcAIA9IIDhABIKSBAjIDhsACOgEAsQkAj4gx90uhzrkJdfLFQTy70Aj4CQGYCwHaCwoIHxDa1ufVfhgAuAwB6AwEghQbCAISF2phdmEgcHJvZ3JhbW1pbmcgY291cnNliBQB0BUBmBYB-BYBgBcBkhcJEgcIARADGJgD&sig=AOD64_0a4KDbvKbdeBAvC-cQKXYh74NwqA&adurl=https://www.bu.edu/met/programs/computer-science-it/%3Futm_source%3Dgoogle%26utm_medium%3Dcpc%26utm_campaign%3D1450636782%26utm_content%3Dsitelink%26utm_term%3DJava%2520programming%2520course&ms=[CLICK_MS]&nx=[NX]&ny=[NY]&nb=[NB]"
},
{
"title": "About BU MET",
"link": "https://www.google.com/aclk?sa=l&ai=CnWPzZqPjYpXXNc_cvdMP3NmoCIjPlPppzYm39KkH5eSqMAgAEAEgu5u6TigCYMkGoAHD-uSfA8gBAaoEYU_Q5EsHdk_452XZaC1_fyB79Co1nIBB_lDRdIQ6e7WBrEQHcxEKyh1LGhnkkMfGDA7Gwr5k8GJ2zNeR47inW0f9xxdyzMyhRRezTWheb5TINlG7AtKmlI6zBRbyYwBT6hzABLSM-YfyAYgF7uvbswXABQSgBlGAB6WFm2CQBwGoB9TSG6gHpr4bqAe5mrECqAfz0RuoB-7SG6gH_5yxAqgHytwbqAfYprECoAiYFrAIBcAIBNIIDhABIKSBAjIDhsACOgEAsQleIh5Ht8Nb9bkJBkfvMNK4_Eb4CQGYCwHaCwoIHxDd1ufVfhgAuAwB6AwEghQbCAISF2phdmEgcHJvZ3JhbW1pbmcgY291cnNliBQB0BUBmBYB-BYBgBcBkhcJEgcIARADGJgD&sig=AOD64_1T3aZKw-_1Y_MxhlWnYb5eJTBXZQ&adurl=https://www.bu.edu/met/about/%3Futm_source%3Dgoogle%26utm_medium%3Dcpc%26utm_campaign%3D1450636782%26utm_content%3Dsitelink%26utm_term%3DJava%2520programming%2520course&ms=[CLICK_MS]&nx=[NX]&ny=[NY]&nb=[NB]"
}
]
}
}
]
DIY Code
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const searchString = "star wars"; // what we want to search (for movie results)
// const searchString = "Red Hot Chili Peppers"; // what we want to search (for related search results)
// const searchString = "java course"; // what we want to search (for other results)
const requestParams = {
baseURL: `https://www.youtube.com`,
encodedQuery: encodeURI(searchString), // what we want to search for in URI encoding
};
async function scrollPage(page, scrollElements) {
let currentElement = 0;
while (true) {
let elementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
for (; currentElement < elementsLength; currentElement++) {
await page.waitForTimeout(200);
await page.evaluate(
(currentElement, scrollElements) => {
document.querySelectorAll(scrollElements)[currentElement].scrollIntoView();
},
currentElement,
scrollElements
);
}
await page.waitForTimeout(5000);
let newElementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
if (newElementsLength === elementsLength) break;
}
}
async function fillMovieDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-movie-renderer")).map((el) => ({
title: el.querySelector("a#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-info #channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-info #channel-name a")?.getAttribute("href")}`,
},
length: el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")?.textContent.trim(),
description: el.querySelector("#description-text")?.textContent.trim(),
info: Array.from(el.querySelectorAll(".movie-metadata-list li")).map((el) => el.textContent.trim()),
extensions: Array.from(el.querySelectorAll(".text-wrapper > ytd-badge-supported-renderer .badge")).map((el) =>
el.querySelector("span")?.textContent.trim()
),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
async function fillRelatedSearchDataFromPage(page) {
while (true) {
const rightArrow = await page.$("#contents > ytd-horizontal-card-list-renderer #right-arrow-container:not([hidden])");
if (rightArrow) {
await page.click("#contents > ytd-horizontal-card-list-renderer #right-arrow-container:not([hidden])");
await page.waitForTimeout(500);
} else break;
}
await page.waitForTimeout(2000);
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-horizontal-card-list-renderer ytd-search-refinement-card-renderer")).map((el) => ({
query: el.querySelector("#card-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a")?.getAttribute("href")}`,
thumbnail: el.querySelector("#img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
async function fillPlaylistsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
const mixes = Array.from(document.querySelectorAll("#contents > ytd-radio-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
const playlists = Array.from(document.querySelectorAll("#contents > ytd-playlist-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-name a")?.getAttribute("href")}`,
},
videoCount: el.querySelector("yt-formatted-string.ytd-thumbnail-overlay-side-panel-renderer")?.textContent.trim(),
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
return [...mixes, ...playlists];
}, requestParams);
return dataFromPage;
}
async function fillChannelsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-channel-renderer")).map((el) => ({
title: el.querySelector("#channel-title #text")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#avatar-section a")?.getAttribute("href")}`,
verified: Boolean(el.querySelector("#channel-title .badge")),
subscribers: el.querySelector("#subscribers")?.textContent.trim(),
description: el.querySelector("#description")?.textContent.trim(),
videoCount: el.querySelector("#video-count")?.textContent.trim(),
thumbnail: el.querySelector("#avatar-section #img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
async function fillCategoriesDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-shelf-renderer")).reduce(
(acc, el) => ({
...acc,
[el.querySelector("#title")?.textContent.trim()]: Array.from(el.querySelectorAll("ytd-video-renderer")).map((el) => ({
title: el.querySelector("a#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-info #channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-info > a")?.getAttribute("href")}`,
thumbnail: el.querySelector("#channel-info > a #img")?.getAttribute("src"),
},
publishedDate: el.querySelectorAll("#metadata-line > span")[1]?.textContent.trim(),
views: el.querySelectorAll("#metadata-line > span")[0]?.textContent.trim(),
length: el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")?.textContent.trim(),
description: el.querySelector(".metadata-snippet-container > yt-formatted-string")?.textContent.trim(),
extensions: Array.from(el.querySelectorAll("#badges .badge")).map((el) => el.querySelector("span")?.textContent.trim()),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
})),
}),
{}
);
}, requestParams);
return dataFromPage;
}
async function fillAdsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-promoted-sparkles-web-renderer")).map((el) => ({
title: el.querySelector("#title")?.textContent.trim(),
link: el.querySelector("#website-text")?.textContent.trim(),
description: el.querySelector("#description")?.textContent.trim(),
thumbnail: el.querySelector("#thumbnail #img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
async function getYoutubeSearchResults() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector("#contents > ytd-video-renderer");
const scrollElements = "#contents > ytd-video-renderer";
await scrollPage(page, scrollElements);
await page.waitForTimeout(10000);
const moviesResults = await fillMovieDataFromPage(page);
const relatedSearch = await fillRelatedSearchDataFromPage(page);
const playlists = await fillPlaylistsDataFromPage(page);
const channels = await fillChannelsDataFromPage(page);
const categories = await fillCategoriesDataFromPage(page);
const ads = await fillAdsDataFromPage(page);
await browser.close();
return { moviesResults, relatedSearch, playlists, channels, categories, ads };
}
getYoutubeSearchResults().then((result) => console.dir(result, { depth: null }));
Preparation
First, we need to create a Node.js* project and add npm
packages puppeteer
, puppeteer-extra
and puppeteer-extra-plugin-stealth
to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.
To do this, in the directory with our project, open the command line and enter npm init -y
, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
.
*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.
📌Note: also, you can use puppeteer
without any extensions, but I strongly recommended use it with puppeteer-extra
with puppeteer-extra-plugin-stealth
to prevent website detection that you are using headless Chromium or that you are using web driver. You can check it on Chrome headless tests website. The screenshot below shows you a difference.
Process
SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have any struggles understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.
The Gif below illustrates the approach of selecting different parts of the results.
Code explanation
Declare constants from required libraries:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
Code | Explanation |
---|---|
puppeteer |
Chromium control library |
StealthPlugin |
library for prevent website detection that you are using web driver |
Next, we "say" to puppeteer
use StealthPlugin
and write what we want to search:
puppeteer.use(StealthPlugin());
const searchString = "star wars"; // what we want to search (for movie results)
// const searchString = "Red Hot Chili Peppers"; // what we want to search (for related search results)
// const searchString = "java course"; // what we want to search (for other results)
❗ Not all search results being viewed are available on the page at the same time, so I use several different searchString
to show all search results.
Next, we write down a function for scrolling page:
async function scrollPage(page, scrollElements) {
let currentElement = 0;
while (true) {
let elementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
for (; currentElement < elementsLength; currentElement++) {
await page.waitForTimeout(200);
await page.evaluate(
(currentElement, scrollElements) => {
document.querySelectorAll(scrollElements)[currentElement].scrollIntoView();
},
currentElement,
scrollElements
);
}
await page.waitForTimeout(5000);
let newElementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
if (newElementsLength === elementsLength) break;
}
}
Code | Explanation |
---|---|
elementsLength |
amount of elements on the page before scrolling |
page.evaluate() |
runs code from the brackets in the browser console and returns the result |
document.querySelectorAll(scrollElements) |
returns a static NodeList representing a list of the document's elements that match the css selectors with class name gets from scrollElements argument |
page.waitForTimeout(200) |
waiting 200 ms before continue |
newElementsLength |
amount of elements on the page after scrolling |
Next, we write functions to get specific data from the search page:
Movie Results
async function fillMovieDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-movie-renderer")).map((el) => ({
title: el.querySelector("a#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-info #channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-info #channel-name a")?.getAttribute("href")}`,
},
length: el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")?.textContent.trim(),
description: el.querySelector("#description-text")?.textContent.trim(),
info: Array.from(el.querySelectorAll(".movie-metadata-list li")).map((el) => el.textContent.trim()),
extensions: Array.from(el.querySelectorAll(".text-wrapper > ytd-badge-supported-renderer .badge")).map((el) =>
el.querySelector("span")?.textContent.trim()
),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
Related Search Results
async function fillRelatedSearchDataFromPage(page) {
while (true) {
const rightArrow = await page.$("#contents > ytd-horizontal-card-list-renderer #right-arrow-container:not([hidden])");
if (rightArrow) {
await page.click("#contents > ytd-horizontal-card-list-renderer #right-arrow-container:not([hidden])");
await page.waitForTimeout(500);
} else break;
}
await page.waitForTimeout(2000);
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-horizontal-card-list-renderer ytd-search-refinement-card-renderer")).map((el) => ({
query: el.querySelector("#card-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a")?.getAttribute("href")}`,
thumbnail: el.querySelector("#img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
Playlists Results
async function fillPlaylistsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
const mixes = Array.from(document.querySelectorAll("#contents > ytd-radio-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
const playlists = Array.from(document.querySelectorAll("#contents > ytd-playlist-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-name a")?.getAttribute("href")}`,
},
videoCount: el.querySelector("yt-formatted-string.ytd-thumbnail-overlay-side-panel-renderer")?.textContent.trim(),
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
return [...mixes, ...playlists];
}, requestParams);
return dataFromPage;
}
Channels Results
async function fillChannelsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-channel-renderer")).map((el) => ({
title: el.querySelector("#channel-title #text")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#avatar-section a")?.getAttribute("href")}`,
verified: Boolean(el.querySelector("#channel-title .badge")),
subscribers: el.querySelector("#subscribers")?.textContent.trim(),
description: el.querySelector("#description")?.textContent.trim(),
videoCount: el.querySelector("#video-count")?.textContent.trim(),
thumbnail: el.querySelector("#avatar-section #img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
Categories Results
async function fillCategoriesDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-shelf-renderer")).reduce(
(acc, el) => ({
...acc,
[el.querySelector("#title")?.textContent.trim()]: Array.from(el.querySelectorAll("ytd-video-renderer")).map((el) => ({
title: el.querySelector("a#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-info #channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-info > a")?.getAttribute("href")}`,
thumbnail: el.querySelector("#channel-info > a #img")?.getAttribute("src"),
},
publishedDate: el.querySelectorAll("#metadata-line > span")[1]?.textContent.trim(),
views: el.querySelectorAll("#metadata-line > span")[0]?.textContent.trim(),
length: el.querySelector("span.ytd-thumbnail-overlay-time-status-renderer")?.textContent.trim(),
description: el.querySelector(".metadata-snippet-container > yt-formatted-string")?.textContent.trim(),
extensions: Array.from(el.querySelectorAll("#badges .badge")).map((el) => el.querySelector("span")?.textContent.trim()),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
})),
}),
{}
);
}, requestParams);
return dataFromPage;
}
Ads Results
async function fillAdsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
return Array.from(document.querySelectorAll("#contents > ytd-promoted-sparkles-web-renderer")).map((el) => ({
title: el.querySelector("#title")?.textContent.trim(),
link: el.querySelector("#website-text")?.textContent.trim(),
description: el.querySelector("#description")?.textContent.trim(),
thumbnail: el.querySelector("#thumbnail #img")?.getAttribute("src"),
}));
}, requestParams);
return dataFromPage;
}
Code | Explanation |
---|---|
el.querySelector("a#video-title") |
returns the first html element with selector a#video-title which is any child of the el html element |
.textContent |
gets the raw text of html element |
.trim() |
removes whitespace from both ends of a string |
.getAttribute("href") |
gets the href attribute value of the html element |
Array.from() |
this method creates a new, shallow-copied Array instance from an iterable or array-like object. |
page.click(".Dx2nRe") |
this method emulates mouse click on the html element with the .Dx2nRe selector |
[...mixes, ...playlists] |
in this line we use spread syntax to create an array from mixes and playlists arrays |
Boolean() |
when Boolean is called as a function, it coerces the parameter to a boolean primitive |
...acc, |
in this code we use spread syntax to create an object from result that was returned from previous reduce call and add to this object new item from current reduce call |
And finally, a function to control the browser, and get information:
async function getYoutubeSearchResults() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector("#contents > ytd-video-renderer");
const scrollElements = "#contents > ytd-video-renderer";
await scrollPage(page, scrollElements);
await page.waitForTimeout(10000);
const moviesResults = await fillMovieDataFromPage(page);
const relatedSearch = await fillRelatedSearchDataFromPage(page);
const playlists = await fillPlaylistsDataFromPage(page);
const channels = await fillChannelsDataFromPage(page);
const categories = await fillCategoriesDataFromPage(page);
const ads = await fillAdsDataFromPage(page);
await browser.close();
return { moviesResults, relatedSearch, playlists, channels, categories, ads };
}
getYoutubeSearchResults().then((result) => console.dir(result, { depth: null }));
Code | Explanation |
---|---|
puppeteer.launch({options}) |
this method launches a new instance of the Chromium browser with current options |
headless |
defines which mode to use: headless (by default) or non-headless |
args |
an array with arguments which is used with Chromium |
["--no-sandbox", "--disable-setuid-sandbox"] |
these arguments we use to allow the launch of the browser process in the online IDE |
browser.newPage() |
this method launches a new page |
page.setDefaultNavigationTimeout(60000) |
changing default (30 sec) time for waiting for selectors to 60000 ms (1 min) for slow internet connection |
page.goto(URL) |
navigation to URL which is defined above |
browser.close() |
after all we close the browser instance |
console.dir(result, { depth: null }) |
console method dir allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info |
Now we can launch our parser. To do this enter node YOUR_FILE_NAME
in your command line. Where YOUR_FILE_NAME
is the name of your .js
file.
Output
📌Note: I've combined the results of different runs of our parser into one for convenience.
{
"moviesResults": [
{
"title": "Star Wars: A New Hope",
"link": "https://www.youtube.com/watch?v=5OQ4qB6XQaw",
"channel": {
"name": "YouTube Movies",
"link": "https://www.youtube.com/channel/UClgRkhTL3_hImCAmdLfDE4g"
},
"length": "2:04:43",
"description": "Young farm boy Luke Skywalker is thrust into a galaxy of adventure when he intercepts a distress call from the captive Princess ...",
"info": ["Science fiction • 1977 • English audio", "Actors: Mark Hamill, Harrison Ford, Carrie Fischer", "Director: George Lucas"],
"extensions": ["CC"],
"thumbnail": "https://i.ytimg.com/vi_webp/5OQ4qB6XQaw/movieposter.webp"
},
...and other movies results
],
"relatedSearch": [
{
"query": "Foo Fighters",
"link": "https://www.youtube.com/results?search_query=foo+fighters&sp=EhySARkKCS9tLzAycjN6eSoMZm9vIGZpZ2h0ZXJzeAE%253D",
"thumbnail": "https://lh3.googleusercontent.com/AjZ4PKnTltansmzMgehO4gbugmYdST2SO01i254dNq12it5vqaNW2OuYyxuLVWa6m6-zrPEI1oKqyA6k=w320-h180-p-k-c0x00ffffff-no-rj-mo"
},
{
"query": "Pearl Jam",
"link": "https://www.youtube.com/results?search_query=pearl+jam&sp=EhmSARYKCS9tLzBkMTkzaCoJcGVhcmwgamFteAE%253D",
"thumbnail": "https://lh3.googleusercontent.com/btX91X6V9ZMo6c59bk5PeJqQFp2dKAHKMEV0tnOKybdp_agkj-UwpQt6DGYpNeuz-6bBlrl0nTAi4w=w320-h180-p-k-c0x00ffffff-no-rj-mo"
},
...and other related search results
],
"playlists": [
{
"title": "Mix - Java course",
"link": "https://www.youtube.com/watch?v=WOUpjal8ee4&list=RDQMRcaAqgwo2VM&start_radio=1",
"videos": [
{
"title": "1.1 How to be a Java Programmer | What is Java",
"link": "https://www.youtube.com/watch?v=WOUpjal8ee4&list=RDQMRcaAqgwo2VM&start_radio=1",
"length": "17:19"
},
{
"title": "Collection and Generics in Java",
"link": "https://www.youtube.com/watch?v=5NOLuoG6fcQ&list=RDQMRcaAqgwo2VM&start_radio=1",
"length": "14:32"
}
],
"thumbnail": "https://i.ytimg.com/vi/WOUpjal8ee4/hqdefault.jpg?sqp=-oaymwEXCNACELwBSFryq4qpAwkIARUAAIhCGAE=&rs=AOn4CLBuT5Pm-qqxXa1G8FkqQ6hc72uS5g"
},
{
"title": "Java Programming",
"link": "https://www.youtube.com/watch?v=VHbSopMyc4M&list=PLBlnK6fEyqRjKA_NuK9mHmlk0dZzuP1P5",
"channel": {
"name": "Neso Academy",
"link": "https://www.youtube.com/c/nesoacademy"
},
"videoCount": "129",
"videos": [
{
"title": "Why take this Java Course?",
"link": "https://www.youtube.com/watch?v=VHbSopMyc4M&list=PLBlnK6fEyqRjKA_NuK9mHmlk0dZzuP1P5",
"length": "4:29"
},
{
"title": "Programs and Programming Languages",
"link": "https://www.youtube.com/watch?v=-C88r0niLQQ&list=PLBlnK6fEyqRjKA_NuK9mHmlk0dZzuP1P5",
"length": "8:03"
}
],
"thumbnail": "https://i.ytimg.com/vi/VHbSopMyc4M/hqdefault.jpg?sqp=-oaymwEXCNACELwBSFryq4qpAwkIARUAAIhCGAE=&rs=AOn4CLBoEkrSHwD3jOdu5lTCnVCs9MZAmQ"
},
...and other playlists results
],
"channels": [
{
"title": "IIT Madras - BSc Degree Programme",
"link": "https://www.youtube.com/c/IITMadrasBScDegreeProgramme",
"verified": false,
"subscribers": "80.3K subscribers",
"description": "IIT Madras welcomes you to the world's first BSc Degree program in Programming and Data Science. This program was designed ...",
"videoCount": "3,051 videos",
"thumbnail": "//yt3.ggpht.com/ytc/AKedOLQPxSKKSP5nPwrA4yQmr3neKedTY5ukYioAbiIk=s176-c-k-c0x00ffffff-no-rj-mo"
},
...and other channels results
],
"categories": {
"New for you": [
{
"title": "Java Classes - How To Use Classes in Java",
"link": "https://www.youtube.com/watch?v=vjjjGkXpX_I",
"channel": {
"name": "Alex Lee",
"link": "https://www.youtube.com/c/AlexLeeYT",
"thumbnail": "https://yt3.ggpht.com/ytc/AKedOLRNFLCMjPktRxac875zHMNjIa4tGp4Dw7AYWTFsJA=s68-c-k-c0x00ffffff-no-rj"
},
"publishedDate": "2 years ago",
"views": "205K views",
"length": "7:20",
"extensions": [],
"thumbnail": "https://i.ytimg.com/vi/vjjjGkXpX_I/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLDyXVT_EF4J8-9k9svyk64VcOOXXw"
},
...and other "New for you" category results
],
"People also watched": [
{
"title": "О работе junior-разработчика",
"link": "https://www.youtube.com/watch?v=qOPNBsLtTUc",
"channel": {
"name": "Sergey Nemchinskiy",
"link": "https://www.youtube.com/c/SergeyNemchinskiy",
"thumbnail": "https://yt3.ggpht.com/ytc/AKedOLTUvbjXCCrK-KZhEPmNWuK1UMQqQ8shwXTaeZUk-g=s68-c-k-c0x00ffffff-no-rj"
},
"views": "780 watching",
"description": "Всем привет! Мы прикрутили возможность для донатов: https://bit.ly/3zDKgux . В поле Комментарий вписываете свой вопрос ...",
"extensions": ["LIVE", "New"],
"thumbnail": "https://i.ytimg.com/vi/qOPNBsLtTUc/hq720.jpg?sqp=-oaymwEcCOgCEMoBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAbCn3lGhoOJZKYA3vhO7NWZe7CqQ"
},
...and other "People also watched" category results
]
},
"ads": [
{
"title": "Course Java - Coding for All Levels",
"link": "https://www.codecademy.com/get-started/free",
"description": "Master your language with lessons, quizzes, and projects designed for real-life scenarios.",
"thumbnail": "https://tpc.googlesyndication.com/simgad/16256156884054071374"
},
...and other ads results
]
}
Links
If you want to see some projects made with SerpApi, please write me a message.
Add a Feature Request💫 or a Bug🐞