"""Base IO code for datasets. Heavily influenced by Arviz's (and scikit-learn's) implementation."""
import hashlib
import itertools
import os
import shutil
from collections import namedtuple
from urllib.request import urlretrieve
import pandas as pd
# Immutable record describing one downloadable dataset.
FileMetadata = namedtuple(
    "FileMetadata",
    (
        "filename",  # name of the file inside the data home directory
        "url",  # remote location the file is fetched from
        "checksum",  # expected SHA256 hex digest of the file
        "description",  # human readable summary shown when listing datasets
    ),
)
# Registry of the datasets that `load_data` can fetch, keyed by the public dataset name.
# For each entry:
#   - `filename` is the name of the file inside the data home directory,
#   - `url` is where the file is downloaded from when it is not cached yet,
#   - `checksum` is the SHA256 hex digest the downloaded file is compared against,
#   - `description` is the free text printed when the available datasets are listed.
# NOTE: the descriptions are runtime strings (they are part of the dataset listing), so their
# content, including leading/trailing newlines, must not be reformatted casually.
DATASETS = {
"my_data": FileMetadata(
filename="my_data.csv",
url="https://ndownloader.figshare.com/files/28850355",
checksum="1bfcdd10d0848c1811e33e467c92734fb488406ef3f9b9aae16a57b258a30fac",
description="""
Toy dataset with one response variable "y" and two covariates "x" and "z".
""",
),
"adults": FileMetadata(
filename="adults.csv",
url="https://ndownloader.figshare.com/files/28870743",
checksum="27a5270ba720125dfb24a7708cbee0218b2ead36248ae244813655d03320e43e",
description="""
A sample with census data from 1994 in United States.
""",
),
"ANES": FileMetadata(
filename="ANES_2016_pilot.csv",
url="https://ndownloader.figshare.com/files/28870740",
checksum="3106beb6ded5a592ea0405d23b868bd8e74c259d7a7f5242c907555692905772",
description="""
The ANES is a nationally representative, cross-sectional survey used extensively in political
science. This is a dataset from the 2016 pilot study, consisting of responses from 1200 voting-age
U.S. citizens.
""",
),
"ESCS": FileMetadata(
filename="ESCS.csv",
url="https://ndownloader.figshare.com/files/28870722",
checksum="0195545797a4258de138a205a013a84022bbe23e7ff47782b179055c706300b6",
description="""
A longitudinal study of hundreds of adults who completed dozens of different self-report and
behavioral measures over the course of 15 years. Among the behavioral measures is an index of
illegal drug use.
""",
),
"carclaims": FileMetadata(
filename="carclaims.csv",
url="https://ndownloader.figshare.com/files/28870713",
checksum="74924bf5f0a6e5aa5453d87845cea05e6b41bb2052cf6f096d7f102235ae5cdf",
description="""
67856 insurance policies and 4624 (6.8%) claims in Australia between 2004 and 2005
""",
),
"batting": FileMetadata(
filename="Batting.csv",
url="https://ndownloader.figshare.com/files/29749140",
checksum="bbbc9459632c738a07bbe0877970a7bbd1f4c2448193979337fe5bc3a4ab0228",
description="""
Baseball Databank is a compilation of historical baseball data in a convenient, tidy format,
distributed under Open Data terms by the Baseball Data Bank.
""",
),
"cherry_blossoms": FileMetadata(
filename="cherry_blossoms.csv",
url="https://figshare.com/ndownloader/files/31072807",
checksum="b859dd4f64c231c76ecb80b78f26da71e2f92698c50e0ceb93be0399dee24f51",
description="""
Historical Series of Phenological data for Cherry Tree Flowering at Kyoto City. Extracted from
the `rethinking` library in R.
""",
),
"sleepstudy": FileMetadata(
filename="sleepstudy.csv",
url="https://figshare.com/ndownloader/files/31181002",
checksum="0a002bec8be2fa9d40dbbf3d5038e614d113a4fd5bf8813f6f4271c3d6294675",
description="""
The average reaction time per day (in milliseconds) for subjects in a sleep deprivation study.
Days 0-1 were adaptation and training (T1/T2), day 2 was baseline (B); sleep deprivation started
after day 2.
Reaction
Average reaction time (ms)
Days
Number of days of sleep deprivation
Subject
Subject number on which the observation was made
""",
),
"periwinkles": FileMetadata(
filename="periwinkles.csv",
url="https://ndownloader.figshare.com/files/34446077",
checksum="50da9791b7a66fbcc9ea4dd828dc7a3a66d5e067faf10f3bfd143af6c590923a",
description="""Data for 31 periwinkles transplanted downshore as a function of the distance
travelled by them after release.""",
),
"admissions": FileMetadata(
filename="admissions.csv",
url="https://figshare.com/ndownloader/files/34757857",
checksum="41e2312ca09d50e99c2db67fbabc78d215df6ce71eefe880df5e9310a9fa8397",
description="""Admission into graduate school data. This dataset has a binary response
variable called 'admit'. There are three predictor variables: 'gre', 'gpa' and 'rank'.""",
),
"bikes": FileMetadata(
filename="bike_sharing.csv",
url="https://figshare.com/ndownloader/files/38737026",
checksum="3e1844b6da435f910b10899e18188568f7d789c715a286c9c6c2ca23833ee7ac",
description="""
This dataset contains the hourly and daily count of rental bikes between
years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal
information.
- instant: record index
- dteday : date
- season : season (1:winter, 2:spring, 3:summer, 4:fall)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit :
* 1: Clear, Few clouds, Partly cloudy, Partly cloudy
* 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
* 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
* 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temperature : Normalized temperature in Celsius.
The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- atemp: Normalized feeling temperature in Celsius.
The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
- humidty: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- count: count of total rental bikes including both casual and registered
Original source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
""",
),
"mtcars": FileMetadata(
filename="mtcars.csv",
url="https://figshare.com/ndownloader/files/40208785",
checksum="c802190c43e02246da9c6c9c3f13a58f076cc6b77922f4d9766a3c6bdb1b52bd",
description="""
The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10
aspects of automobile design and performance for 32 automobiles (1973--74 models).
The following is a description of the variables:
* mpg: Miles/(US) gallon
* cyl: Number of cylinders
* disp: Displacement (cu.in.)
* hp: Gross horsepower
* drat: Rear axle ratio
* wt: Weight (1000 lbs)
* qsec: 1/4 mile time
* vs: Engine (0 = V-shaped, 1 = straight)
* am: Transmission (0 = automatic, 1 = manual)
* gear: Number of forward gears
""",
),
}
def get_data_home(data_home=None):
    """Return the path of the Bambi data dir.

    This folder is used to avoid downloading the data several times.

    By default the data dir is set to a folder named 'bambi_data' in the user home folder.
    Alternatively, it can be set by the ``"BAMBI_DATA"`` environment variable or programmatically by
    giving an explicit folder path. The ``"~"`` symbol is expanded to the user home folder. If the
    folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_home: str
        The path to Bambi data dir. When ``None``, the value of the ``"BAMBI_DATA"`` environment
        variable is used, falling back to ``~/bambi_data``.

    Returns
    -------
    str
        The user-expanded path of the data dir, which exists when the function returns.

    Raises
    ------
    OSError
        If the directory cannot be created (e.g. the path exists but is not a directory).
    """
    if data_home is None:
        data_home = os.environ.get("BAMBI_DATA", os.path.join("~", "bambi_data"))
    data_home = os.path.expanduser(data_home)
    # `exist_ok=True` avoids the check-then-create race of `os.path.exists` + `os.makedirs`:
    # another process creating the folder in between no longer makes this call fail.
    os.makedirs(data_home, exist_ok=True)
    return data_home
def clear_data_home(data_home=None):
    """Delete all the content of the data home cache.

    The data dir itself is removed together with everything inside it.

    Parameters
    ----------
    data_home: str
        The path to Bambi data dir. By default a folder named ``"bambi_data"`` in the user home
        folder.
    """
    # `get_data_home` resolves the default location and expands "~"; it also creates the folder
    # when missing, so `rmtree` always receives an existing path.
    data_home = get_data_home(data_home)
    shutil.rmtree(data_home)
def _sha256(path):
"""Calculate the sha256 hash of the file at path."""
sha256hash = hashlib.sha256()
chunk_size = 8192
with open(path, "rb") as buff:
while True:
buffer = buff.read(chunk_size)
if not buffer:
break
sha256hash.update(buffer)
return sha256hash.hexdigest()
def load_data(dataset=None, data_home=None):
    """Load a dataset.

    Run with no parameters to get a list of all available data sets.

    The directory to save can also be set with the environment variable ``BAMBI_DATA``.
    The checksum of the dataset is checked against a hardcoded value to watch for data corruption.

    Run ``bmb.clear_data_home()`` to clear the data directory.

    Parameters
    ----------
    dataset: str
        Name of dataset to load.
    data_home: str, optional
        Where to save remote datasets

    Returns
    -------
    pandas.DataFrame or str
        The requested dataset. When ``dataset`` is ``None``, a string describing all the
        available datasets is returned instead.

    Raises
    ------
    ValueError
        If ``dataset`` is not ``None`` and is not the name of an available dataset.
    IOError
        If the SHA256 checksum of the file on disk differs from the expected one.
    """
    home_dir = get_data_home(data_home=data_home)

    if dataset is None:
        return _list_datasets(home_dir)

    if dataset not in DATASETS:
        raise ValueError(
            f"Dataset {dataset} not found! "
            f"The following are available:\n{_list_datasets(home_dir)}"
        )

    datafile = DATASETS[dataset]
    file_path = os.path.join(home_dir, datafile.filename)

    # Only hit the network when the file is not already in the local cache
    if not os.path.exists(file_path):
        try:
            urlretrieve(datafile.url, file_path)
        except BaseException:
            # Do not leave a partially downloaded file behind: it would be taken for a cached
            # copy on the next call and fail the checksum until the cache is cleared by hand.
            if os.path.exists(file_path):
                os.remove(file_path)
            raise

    # Verified on every load, so corruption of an already cached file is caught as well
    checksum = _sha256(file_path)
    if datafile.checksum != checksum:
        raise IOError(
            f"{file_path} has an SHA256 checksum ({checksum}) differing from expected "
            f"({datafile.checksum}), file may be corrupted. Run `bambi.clear_data_home()` "
            "and try again, or please open an issue."
        )
    return pd.read_csv(file_path)
def _list_datasets(home_dir):
"""Get a string representation of all available datasets with descriptions."""
lines = []
for filename, resource in itertools.chain(DATASETS.items()):
file_path = os.path.join(home_dir, filename)
if not os.path.exists(file_path):
location = f"location: {resource.url}"
else:
location = f"location: {file_path}"
lines.append(f"{filename}\n{'=' * len(filename)}\n{resource.description}\n{location}")
return f"\n\n{10 * '-'}\n\n".join(lines)