Calibre Web Scraping Update
Stupid security
One of the recent updates to Calibre Web (I am on v0.6.14 now) introduced some security to the login in the form of a CSRF token. I had no idea why my script wouldn’t work anymore until I started taking it all apart.
What I ended up having to do was use Beautiful Soup to load the HTML content of the login page, find the csrf_token and then add it to the login data. I also added in the cookies, although frankly by that time I wasn’t sure if it was necessary or not. But it works, so I left it.
Just a note: if any of you have found this page wanting to scrape Calibre-Web, search the site for the tag calibre and you will find the whole journey.
The current python script
#! /usr/local/bin/python3
# coding: utf-8
# python script to import shelfs of books from Calibre-Web
# and save them as a markdown file
# import various libraries
import requests
from bs4 import BeautifulSoup
import re
import os
# enable sys.exit()
import sys
# Root address of the Calibre-Web instance; request paths are appended to it.
urlpath = 'https://MY_CALIBREWEB_URL'

# Browser-like user agent, sent so the server does not label the script a bot.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}

# Form fields posted to the Calibre-Web login page.
login_data = {
    'next': '/',
    'username': 'MY_USERNAME',
    'password': 'MY_PASSWORD',
    'remember_me': 'on',
}
# Log in to Calibre-Web, then walk every shelf and write its books to a
# Markdown file. The session keeps the login cookie for all later requests.
with requests.Session() as sess:
    # Load the login form first: it carries the csrf_token that the login
    # POST has to send back.
    res = sess.get(urlpath + '/login', headers=headers)
    login = BeautifulSoup(res.content, 'html.parser')
    # find security token and extract value; when the form has no token
    # field the login is attempted without one instead of crashing
    token_input = login.find('input', {'name': 'csrf_token'})
    token_value = token_input.get('value') if token_input is not None else None
    if token_value is not None:
        # append security token to login dictionary
        login_data['csrf_token'] = token_value
    # login with token
    res = sess.post(urlpath + '/login', data=login_data, cookies=res.cookies)

    # set path to export as markdown file
    path_folder = "FOLDER_PATH_MOUNTED_BY_SHELL_SCRIPT"
    # check if server is already mounted and change path if so
    if os.path.isdir(path_folder):
        print('valid drive')
    elif os.path.isdir("/Volumes/www/home/books/"):
        path_folder = "/Volumes/www/home/books/"
    else:
        path_folder = "/Volumes/www-1/home/books/"

    # One pattern serves both the shelf heading and the publication date.
    year_pattern = re.compile(r"([0-9]{4})")

    # os.path.join copes with folders given with or without a trailing slash.
    # The encoding is explicit because the text below contains curly quotes
    # and dashes; the with-block closes the file even if a request fails.
    with open(os.path.join(path_folder, "bookfile.md"), "w", encoding="utf-8") as file:
        # Set Title
        file.write("# Books Read since 2012\n")
        # Set intro blurb
        file.write("This is an automatically generated list of books scraped from my Calibre ebook library ( Making A “Books Read” Page ). As a result it does not include any paper books I may have read as they do not exist in that library.\n\nI update it regularly and finally went back and added all the previous years. Links to previous years’ book count posts: \n- 2012 (85)\n- 2013 (95)\n- 2014 (106)\n- 2015 (92)\n- 2016 (101)\n- 2017 (120)\n- 2018 (142)\n- 2019 (123)\n- 2020 (112)\n")

        # find list of shelves
        shelfhtml = sess.get(urlpath)
        soup = BeautifulSoup(shelfhtml.text, "html.parser")
        shelflist = soup.find_all('a', href=re.compile('/shelf/[1-9]'))
        print(shelflist)
        # reverse order of the shelf list (on a copy, so shelflist keeps
        # the page order)
        dateshelflist = list(reversed(shelflist))
        print(dateshelflist)

        # loop through sorted shelves
        for shelf in dateshelflist:
            # set shelf page url
            res = sess.get(urlpath + shelf.get('href'))
            soup = BeautifulSoup(res.text, "html.parser")
            # find year in the shelf heading and format it
            shelfyear = soup.find('h2')
            year = year_pattern.search(shelfyear.text) if shelfyear is not None else None
            if year is None:
                # A shelf without a four-digit year in its heading cannot be
                # given a section title; report it and move on.
                print('no year found for shelf {}, skipping'.format(shelf.get('href')))
                continue
            file.write("\n### {}\n".format(year.group()))

            # find all books
            books = soup.find_all('div', class_='col-sm-3 col-lg-2 col-xs-6 book')
            # loop through books. Each book is a new BeautifulSoup object.
            for book in books:
                title = book.find('p', class_='title')
                author = book.find('a', class_='author-name')
                # I have to manually add this id to the shelf.html template
                seriesnamea = book.find('p', id='series')
                # NOTE(review): the first replace removes every space in the
                # series text, including spaces inside the name - confirm
                # that is intended.
                seriesname = (seriesnamea.text if seriesnamea is not None else "").replace(
                    " ", "").replace("(", " Book ").replace(")", "").replace("\n", "")
                if seriesname:
                    seriesname = "*" + seriesname + "*"
                # NOTE: pubdate is custom added by me to /templates/shelf and
                # won't work in a standard install
                pubdate = book.find('p', class_='series', id='pubdate')
                # extract year from pubdate, falling back to "n/a" when the
                # element is missing or holds no four-digit year
                pubmatch = year_pattern.search(pubdate.text) if pubdate is not None else None
                pubyear = pubmatch.group() if pubmatch is not None else "n/a"
                # construct line using markdown; pubyear is already a plain
                # string here, so it is formatted directly
                newstring = "* ***{}*** — {} ({})\n{} – ebook\n".format(
                    title.text, author.text, pubyear, seriesname)
                file.write(newstring)
NOTE—July 18, 2023 Added some new code from various updates (see calibre updates)