Scrape all Google Scholar Profile, Author Results to CSV with Python and SerpApi

What will be scraped


Prerequisites

Separate virtual environment

In short, a virtual environment is an isolated set of installed libraries (and, optionally, its own Python version) that can coexist with other environments on the same system, which prevents library and Python version conflicts.

If you haven't worked with a virtual environment before, have a look at my dedicated blog post, "Python virtual environments tutorial using Virtualenv and Poetry", to get familiar.

Install libraries:

pip install pandas google-search-results 

Process

If you don't need an explanation:

Scrape all Google Scholar Profile Results

from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

def profile_results():
    """Collect every Google Scholar profile returned for the search query.

    Sends the query to SerpApi's ``google_scholar_profiles`` engine and keeps
    requesting pages for as long as the response contains a
    ``pagination.next`` URL.

    Returns:
        list[dict]: One dictionary per profile with the keys ``thumbnail``,
        ``name``, ``link``, ``author_id``, ``email``, ``affiliations``,
        ``cited_by`` and ``interests``. A value is ``None`` when the
        response does not contain that field.
    """
    print("Extracting profile results..")

    params = {
        "api_key": "...",                     # https://serpapi.com/manage-api-key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)

    # Keys copied from every profile entry, in output order.
    fields = (
        "thumbnail",
        "name",
        "link",
        "author_id",
        "email",
        "affiliations",
        "cited_by",
        "interests",
    )
    profile_results_data = []

    while True:
        # Named ``page`` (not ``profile_results``) so the local variable does
        # not shadow this function's own name.
        page = search.get_dict()

        for profile in page.get("profiles", []):
            print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
            profile_results_data.append({field: profile.get(field) for field in fields})

        # ``.get`` instead of ``page["pagination"]``: a response without a
        # "pagination" section must end the loop, not raise KeyError.
        next_page_url = (page.get("pagination") or {}).get("next")
        if not next_page_url:
            break

        # The "next" URL carries the query parameters of the following page;
        # merging them into the search makes the next call request that page.
        search.params_dict.update(dict(parse_qsl(urlsplit(next_page_url).query)))

    return profile_results_data

Scraping all profile results explanation

Import libraries:

from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

Pass search parameters to SerpApi and create a temporary list:

# Search parameters passed to SerpApi for the profiles search.
params = {
    "api_key": "...",                     # https://serpapi.com/manage-api-key
    "engine": "google_scholar_profiles",  # profile results search engine
    "mauthors": "blizzard",               # search query
}
search = GoogleSearch(params)

# Temporary list that receives one dictionary per extracted profile.
profile_results_data = []

Set up a while loop and add an if statement to exit the while loop if no pages are left:

# Loop flag: stays True while there is another page of profiles to request.
profiles_is_present = True
while profiles_is_present:

    # Fetch the current page of results as a dictionary.
    profile_results = search.get_dict()
    
    # for loop extraction here..
    
    # If the response's pagination contains a "next" URL -> copy that URL's
    # query parameters into the search so the next iteration requests that page.
    # If there is no next page -> exit the while loop.
    if "next" in profile_results.get("pagination", []):
        search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
    else:
        profiles_is_present = False

Iterate over profile results in a for loop:

# Walk the profiles of the current page; a response without a "profiles" key
# yields an empty list, so the loop body is simply skipped.
for profile in profile_results.get("profiles", []):

    # Progress message for the profile being processed.
    print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')

    # .get() returns None for any field the profile entry does not contain.
    thumbnail = profile.get("thumbnail")
    name = profile.get("name")
    link = profile.get("link")
    author_id = profile.get("author_id")
    affiliations = profile.get("affiliations")
    email = profile.get("email")
    cited_by = profile.get("cited_by")
    interests = profile.get("interests")

Append extracted data to temporary list as a dictionary and return it:

profile_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "link": link,
    "author_id": author_id,
    "email": email,
    "affiliations": affiliations,
    "cited_by": cited_by,
    "interests": interests

return profile_results_data

# example output:
'''
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other profiles

[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
    "author_id": "_xwYD2sAAAAJ",
    "email": "Verified email at AdamLobel.com",
    "affiliations": "Blizzard Entertainment",
    "cited_by": 2935,
    "interests": [
      {
        "title": "Gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:gaming"
      },
      {
        "title": "Emotion regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:emotion_regulation"
      }
    ]
  },
  ... other profiles
]
'''

Scrape Google Scholar Author Results

from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

def author_results():
    """Fetch the Google Scholar author page of every profile found.

    For each profile returned by ``profile_results()`` this queries SerpApi's
    ``google_scholar_author`` engine and keeps the author details, the
    citation table/graph, the public-access figures and the co-authors.

    Returns:
        list[dict]: One dictionary per author with the keys ``thumbnail``,
        ``name``, ``affiliations``, ``email``, ``website``, ``interests``,
        ``cited_by_table``, ``cited_by_graph``, ``public_access_link``,
        ``available_public_access``, ``not_available_public_access`` and
        ``co_authors``. A value is ``None`` when the response does not
        contain it.
    """
    print("extracting author results..")

    author_results_data = []

    # Each item is a whole profile dictionary; only its "author_id" is needed.
    for profile in profile_results():
        author_id = profile["author_id"]

        print(f"Parsing {author_id} author ID.")

        params = {
            "api_key": "...",                     # https://serpapi.com/manage-api-key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id,               # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()

        # Any of these sections may be absent from a response. Falling back
        # to an empty dict makes the lookups below yield None instead of
        # raising AttributeError on ``None.get(...)``.
        author = results.get("author") or {}
        cited_by = results.get("cited_by") or {}
        public_access = results.get("public_access") or {}

        author_results_data.append({
            "thumbnail": author.get("thumbnail"),
            "name": author.get("name"),
            "affiliations": author.get("affiliations"),
            "email": author.get("email"),
            "website": author.get("website"),
            "interests": author.get("interests"),
            "cited_by_table": cited_by.get("table"),
            "cited_by_graph": cited_by.get("graph"),
            "public_access_link": public_access.get("link"),
            "available_public_access": public_access.get("available"),
            "not_available_public_access": public_access.get("not_available"),
            "co_authors": results.get("co_authors"),
        })

    return author_results_data

Scraping author results explanation

Import profile_results() function and other libraries:

from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

profile_results() will iterate over all available pages and return a list of dictionaries, one per profile, each including the author ID, for example _xwYD2sAAAAJ.

Create temporary list to store extracted data:

# Temporary list that receives one dictionary per author.
author_results_data = []

Iterate over the extracted profiles and pass each author_id to the author_id search parameter:

# NOTE: despite its name, `author_id` is a whole profile dictionary returned by
# profile_results(); the actual ID string is author_id["author_id"].
for author_id in profile_results():

    print(f"Parsing {author_id['author_id']} author ID.")

    # One search per author, using the author engine instead of the profiles engine.
    params = {
        "api_key": "...",                     # https://serpapi.com/manage-api-key
        "engine": "google_scholar_author",    # author results search engine
        "author_id": author_id["author_id"],  # search query: _xwYD2sAAAAJ
        "hl": "en"
    }
    search = GoogleSearch(params)
    # Response for this author as a dictionary.
    results = search.get_dict()

Extract the data:

# The response may not contain an "author" section. Falling back to an empty
# dict makes the lookups below return None instead of raising AttributeError.
author = results.get("author") or {}

thumbnail = author.get("thumbnail")
name = author.get("name")
affiliations = author.get("affiliations")
email = author.get("email")
website = author.get("website")
interests = author.get("interests")

# Citation statistics: summary table and per-year graph.
cited_by_table = results.get("cited_by", {}).get("table")
cited_by_graph = results.get("cited_by", {}).get("graph")

# Public-access figures and the list of co-authors.
public_access_link = results.get("public_access", {}).get("link")
available_public_access = results.get("public_access", {}).get("available")
not_available_public_access = results.get("public_access", {}).get("not_available")
co_authors = results.get("co_authors")

Append extracted data to temporary list as a dictionary and return it:

# Store everything extracted for the current author as one dictionary.
author_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "affiliations": affiliations,
    "email": email,
    "website": website,
    "interests": interests,
    "cited_by_table": cited_by_table,
    "cited_by_graph": cited_by_graph,
    "public_access_link": public_access_link,
    "available_public_access": available_public_access,
    "not_available_public_access": not_available_public_access,
    "co_authors": co_authors
})

# After the for loop: hand the collected authors to the caller.
return author_results_data


# example output:
'''
extracting author results..
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other authors
Parsing _xwYD2sAAAAJ author ID.
... other authors

[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=view_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "affiliations": "Blizzard Entertainment",
    "email": "Verified email at AdamLobel.com",
    "website": "https://twitter.com/GrowingUpGaming",
    "interests": [
      {
        "title": "Gaming",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming"
      },
      {
        "title": "Emotion regulation",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:emotion_regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation"
      }
    ],
    "cited_by_table": [
      {
        "citations": {
          "all": 2935,
          "since_2017": 2348
        }
      },
      {
        "h_index": {
          "all": 10,
          "since_2017": 10
        }
      },
      {
        "i10_index": {
          "all": 11,
          "since_2017": 10
        }
      }
    ],
    "cited_by_graph": [
      {
        "year": 2014,
        "citations": 70
      },
      {
        "year": 2015,
        "citations": 188
      },
      {
        "year": 2016,
        "citations": 243
      },
      {
        "year": 2017,
        "citations": 342
      },
      {
        "year": 2018,
        "citations": 420
      },
      {
        "year": 2019,
        "citations": 553
      },
      {
        "year": 2020,
        "citations": 507
      },
      {
        "year": 2021,
        "citations": 504
      },
      {
        "year": 2022,
        "citations": 16
      }
    ],
    "public_access_link": "https://scholar.google.com/citations?view_op=list_mandates&hl=en&user=_xwYD2sAAAAJ",
    "available_public_access": 1,
    "not_available_public_access": 0,
    "co_authors": [
      {
        "name": "Isabela Granic",
        "link": "https://scholar.google.com/citations?user=4T5cjVIAAAAJ&hl=en",
        "serpapi_link": "https://serpapi.com/search.json?author_id=4T5cjVIAAAAJ&engine=google_scholar_author&hl=en",
        "author_id": "4T5cjVIAAAAJ",
        "affiliations": "Radboud University Nijmegen",
        "email": "Verified email at pwo.ru.nl",
        "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=4T5cjVIAAAAJ&citpid=4"
      },
      ... other co-authors
      }
    ]
  }
  ... other authors
]
'''

Scrape all Author Articles from Google Scholar

from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

def all_author_articles():
    """Collect every article of every author found by ``profile_results()``.

    For each profile the ``google_scholar_author`` engine is queried with
    ``sort=pubdate`` and the result pages are followed through
    ``serpapi_pagination.next`` until no further page is reported.

    Returns:
        list[dict]: One dictionary per article with the keys
        ``article_title``, ``article_link``, ``article_year``,
        ``article_citation_id``, ``article_authors``,
        ``article_publication``, ``article_cited_by_value``,
        ``article_cited_by_link`` and ``article_cited_by_cites_id``.
        A value is ``None`` when the response does not contain it.
    """
    author_article_results_data = []

    # Each item is a whole profile dictionary; only its "author_id" is needed.
    for index, profile in enumerate(profile_results(), start=1):
        author_id = profile["author_id"]

        print(f"Parsing {index} author with {author_id} author ID.")

        params = {
            "api_key": "...",                    # https://serpapi.com/manage-api-key
            "engine": "google_scholar_author",   # author results search engine
            "hl": "en",                          # language
            "sort": "pubdate",                   # sort by year
            "author_id": author_id               # search query
        }
        search = GoogleSearch(params)

        while True:
            results = search.get_dict()

            for article in results.get("articles", []):
                cited_by = article.get("cited_by") or {}

                author_article_results_data.append({
                    "article_title": article.get("title"),
                    "article_link": article.get("link"),
                    "article_year": article.get("year"),
                    "article_citation_id": article.get("citation_id"),
                    "article_authors": article.get("authors"),
                    "article_publication": article.get("publication"),
                    "article_cited_by_value": cited_by.get("value"),
                    "article_cited_by_link": cited_by.get("link"),
                    "article_cited_by_cites_id": cited_by.get("cites_id"),
                })

            # The pagination check is part of the ``while`` body, at the same
            # level as the ``for`` loop above. It was previously indented to a
            # column matching no enclosing block, which is an IndentationError.
            next_page_url = (results.get("serpapi_pagination") or {}).get("next")
            if not next_page_url:
                break

            # The "next" URL carries the parameters of the following page.
            search.params_dict.update(dict(parse_qsl(urlsplit(next_page_url).query)))

    return author_article_results_data

Scraping all author articles explanation

Import profile_results() function and other libraries:

from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

In this case, profile_results() was used to get author_id as well, in order to parse author articles.

Create temporary list to store extracted data:

# Temporary list that receives one dictionary per article, across all authors.
author_article_results_data = []

Iterate over profile_results() and pass author_id to parameter search query:

# enumerate(..., start=1) numbers the authors from 1 for the progress message.
# NOTE: `author_id` is a whole profile dictionary; the ID is author_id["author_id"].
for index, author_id in enumerate(profile_results(), start=1):

    print(f"Parsing {index} author with {author_id['author_id']} author ID.")
  
    params = {
        "api_key": "...",                    # https://serpapi.com/manage-api-key
        "engine": "google_scholar_author",   # author results search engine
        "hl": "en",                          # language
        "sort": "pubdate",                   # sort by year
        "author_id": author_id["author_id"]  # search query
    }
    # A new search object per author; its parameters are updated while paginating.
    search = GoogleSearch(params)

Set up a while loop and check if next page is present:

# Loop flag: stays True while the current author has another page of articles.
articles_is_present = True
while articles_is_present:
    results = search.get_dict()

    # data extraction code..

    # If a next page is present -> copy the query parameters of its URL into
    # the search so the next iteration requests that page.
    # If a next page is not present -> exit the while loop.
    # (An empty dict is the fallback because "serpapi_pagination" is a
    # dictionary when present.)
    if "next" in results.get("serpapi_pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
    else:
        articles_is_present = False

Extract data in a for loop:

# Walk the articles of the current page; a missing "articles" key yields no iterations.
for article in results.get("articles", []):
    # .get() returns None for any field the article entry does not contain.
    title = article.get("title")
    link = article.get("link")
    citation_id = article.get("citation_id")
    authors = article.get("authors")
    publication = article.get("publication")
    # "cited_by" is a nested dictionary; the {} default keeps the chained
    # .get() safe when the article has no such entry.
    cited_by_value = article.get("cited_by", {}).get("value")
    cited_by_link = article.get("cited_by", {}).get("link")
    cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
    year = article.get("year")

Append extracted data to temporary list as a dictionary:

# Store the fields of the current article as one dictionary; every key is
# prefixed with "article_".
author_article_results_data.append({
    "article_title": title,
    "article_link": link,
    "article_year": year,
    "article_citation_id": citation_id,
    "article_authors": authors,
    "article_publication": publication,
    "article_cited_by_value": cited_by_value,
    "article_cited_by_link": cited_by_link,
    "article_cited_by_cites_id": cited_by_cites_id,
})

Return extracted data:

# Last statement of all_author_articles(): hand the collected article rows to the caller.
return author_article_results_data

Save Google Scholar Profile and Author Results to CSV

from google_scholar_profile_results import profile_results
import pandas as pd

def save_profile_results_to_csv():
    """Write the rows returned by ``profile_results()`` to a CSV file.

    The output file is ``google_scholar_profile_results.csv``, UTF-8 encoded
    and written without the pandas index column.
    """
    print("Waiting for profile results to save..")

    # Build the table first, then serialize it.
    profiles_frame = pd.DataFrame(data=profile_results())
    profiles_frame.to_csv(
        "google_scholar_profile_results.csv",
        encoding="utf-8",
        index=False,
    )

    print("Profile Results Saved.")

    
def save_author_result_to_csv():
    """Write the rows returned by ``author_results()`` to a CSV file.

    The output file is ``google_scholar_author_results.csv``, UTF-8 encoded
    and written without the pandas index column.

    ``author_results`` must be in scope; in the full code listing it is
    defined in the same module as this function.
    """
    print("Waiting for author results to save..")
    # Use author_results() here: profile_results() was passed before, which
    # wrote the profile rows into the author results file.
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)

    print("Author Results Saved.")


def save_author_articles_to_csv():
    """Write the rows returned by ``all_author_articles()`` to a CSV file.

    The output file is ``google_scholar_author_articles.csv``, UTF-8 encoded
    and written without the pandas index column.

    ``all_author_articles`` must be in scope; in the full code listing it is
    defined in the same module as this function.
    """
    print("Waiting for author articles to save..")
    # Use all_author_articles() here: profile_results() was passed before,
    # which wrote the profile rows into the author articles file.
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)

    print("Author Articles Saved.")
  • data argument inside DataFrame is your data.
  • encoding='utf-8' argument just to make sure everything will be saved correctly. I used it explicitly even though it's the default value.
  • index=False argument to drop default pandas row numbers.

Full Code

import os
from urllib.parse import urlsplit, parse_qsl

import pandas as pd
from serpapi import GoogleSearch


def profile_results():
    """Return all Google Scholar profiles found for the search query.

    Pages are requested from SerpApi's ``google_scholar_profiles`` engine one
    after another; the loop stops when a response's pagination has no
    ``next`` entry.

    Returns:
        list[dict]: One dictionary per profile holding ``thumbnail``,
        ``name``, ``link``, ``author_id``, ``email``, ``affiliations``,
        ``cited_by`` and ``interests`` (``None`` for fields the response
        does not contain).
    """
    print("Extracting profile results..")

    search = GoogleSearch({
        "api_key": "...",                     # https://serpapi.com/manage-api-key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    })

    # Fields copied from each profile entry, in output order.
    wanted_keys = (
        "thumbnail",
        "name",
        "link",
        "author_id",
        "email",
        "affiliations",
        "cited_by",
        "interests",
    )
    profile_results_data = []

    while True:
        page = search.get_dict()

        for entry in page.get("profiles", []):
            print(f'Currently extracting {entry.get("name")} with {entry.get("author_id")} ID.')
            profile_results_data.append({key: entry.get(key) for key in wanted_keys})

        # No "next" entry means the last page has been processed.
        if "next" not in page.get("pagination", []):
            break

        # Reuse the query parameters of the "next" URL for the following request.
        next_url = page.get("pagination").get("next")
        search.params_dict.update(dict(parse_qsl(urlsplit(next_url).query)))

    return profile_results_data


def author_results():
    """Fetch the Google Scholar author page of every profile found.

    For each profile returned by ``profile_results()`` this queries SerpApi's
    ``google_scholar_author`` engine and keeps the author details, the
    citation table/graph, the public-access figures and the co-authors.

    The SerpApi key is read from the ``API_KEY`` environment variable. This
    requires ``import os`` at the top of the file; without it the function
    raises NameError on ``os.getenv``.

    Returns:
        list[dict]: One dictionary per author with the keys ``thumbnail``,
        ``name``, ``affiliations``, ``email``, ``website``, ``interests``,
        ``cited_by_table``, ``cited_by_graph``, ``public_access_link``,
        ``available_public_access``, ``not_available_public_access`` and
        ``co_authors``. A value is ``None`` when the response does not
        contain it.
    """
    print("extracting author results..")

    author_results_data = []

    # Each item is a whole profile dictionary; only its "author_id" is needed.
    for profile in profile_results():
        author_id = profile["author_id"]

        print(f"Parsing {author_id} author ID.")

        params = {
            # https://docs.python.org/3/library/os.html#os.getenv
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id,               # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()

        # Any of these sections may be absent from a response. Falling back
        # to an empty dict makes the lookups below yield None instead of
        # raising AttributeError on ``None.get(...)``.
        author = results.get("author") or {}
        cited_by = results.get("cited_by") or {}
        public_access = results.get("public_access") or {}

        author_results_data.append({
            "thumbnail": author.get("thumbnail"),
            "name": author.get("name"),
            "affiliations": author.get("affiliations"),
            "email": author.get("email"),
            "website": author.get("website"),
            "interests": author.get("interests"),
            "cited_by_table": cited_by.get("table"),
            "cited_by_graph": cited_by.get("graph"),
            "public_access_link": public_access.get("link"),
            "available_public_access": public_access.get("available"),
            "not_available_public_access": public_access.get("not_available"),
            "co_authors": results.get("co_authors"),
        })

    return author_results_data


def all_author_articles():
    """Return the articles of every author found by ``profile_results()``.

    Each author is queried through the ``google_scholar_author`` engine,
    sorted by publication date, and the result pages are followed while the
    response's ``serpapi_pagination`` has a ``next`` entry.

    Returns:
        list[dict]: One dictionary per article; all keys carry the
        ``article_`` prefix (title, link, year, citation_id, authors,
        publication, cited_by_value, cited_by_link, cited_by_cites_id).
    """
    author_article_results_data = []

    # Every item yielded here is a complete profile dictionary.
    for position, profile in enumerate(profile_results(), start=1):

        print(f"Parsing author #{position} with {profile['author_id']} author ID.")

        search = GoogleSearch({
            "api_key": "...",                    # https://serpapi.com/manage-api-key
            "engine": "google_scholar_author",   # author results search engine
            "hl": "en",                          # language
            "sort": "pubdate",                   # sort by year
            "author_id": profile["author_id"],   # search query
        })

        while True:
            page = search.get_dict()

            for item in page.get("articles", []):
                # Nested citation info; an empty dict when the article has none.
                citations = item.get("cited_by", {})

                author_article_results_data.append({
                    "article_title": item.get("title"),
                    "article_link": item.get("link"),
                    "article_year": item.get("year"),
                    "article_citation_id": item.get("citation_id"),
                    "article_authors": item.get("authors"),
                    "article_publication": item.get("publication"),
                    "article_cited_by_value": citations.get("value"),
                    "article_cited_by_link": citations.get("link"),
                    "article_cited_by_cites_id": citations.get("cites_id"),
                })

            # No "next" entry means this author's last page has been processed.
            if "next" not in page.get("serpapi_pagination", []):
                break

            # Reuse the query parameters of the "next" URL for the following request.
            next_url = page.get("serpapi_pagination").get("next")
            search.params_dict.update(dict(parse_qsl(urlsplit(next_url).query)))

    return author_article_results_data


def save_author_result_to_csv():
    """Write the rows returned by ``author_results()`` to a CSV file.

    The output file is ``google_scholar_author_results.csv``, UTF-8 encoded
    and written without the pandas index column.
    """
    print("Waiting for author results to save..")
    # Use author_results() here: profile_results() was passed before, which
    # wrote the profile rows into the author results file.
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)

    print("Author Results Saved.")


def save_author_articles_to_csv():
    """Write the rows returned by ``all_author_articles()`` to a CSV file.

    The output file is ``google_scholar_author_articles.csv``, UTF-8 encoded
    and written without the pandas index column.
    """
    print("Waiting for author articles to save..")
    # Use all_author_articles() here: profile_results() was passed before,
    # which wrote the profile rows into the author articles file.
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)

    print("Author Articles Saved.")


def save_profile_results_to_csv():
    """Write the rows returned by ``profile_results()`` to a CSV file.

    The output file is ``google_scholar_profile_results.csv``, UTF-8 encoded
    and written without the pandas index column.
    """
    print("Waiting for profile results to save..")

    # Build the table first, then serialize it.
    profiles_frame = pd.DataFrame(data=profile_results())
    profiles_frame.to_csv(
        "google_scholar_profile_results.csv",
        encoding="utf-8",
        index=False,
    )

    print("Profile Results Saved.")


Join us on Reddit | Twitter | YouTube

Add a Feature Request💫 or a Bug🐞