Sample Code

Get started in Python, R, or Stata. Copy-paste and run.

Load a parquet file (Python)

import pandas as pd

# Read a single locally downloaded ticker (from hfdatalibrary.com)
df = pd.read_parquet("AAPL_clean.parquet")
print("Rows: {:,}".format(len(df)))
print(df.head())

Query via API

import requests
import pandas as pd
from io import StringIO

API_KEY = "your-key-here"
BASE = "https://api.hfdatalibrary.com/v1"

# Get 1-minute bars for AAPL, January 2024, clean version.
# timeout prevents the request from hanging forever on a dead server.
r = requests.get(
    f"{BASE}/bars/AAPL",
    headers={"X-API-Key": API_KEY},
    params={
        "start": "2024-01-02",
        "end": "2024-01-31",
        "version": "clean",
        "format": "csv"
    },
    timeout=30
)
# Fail fast on 4xx/5xx instead of parsing an HTML/JSON error body as CSV.
r.raise_for_status()

df = pd.read_csv(StringIO(r.text), parse_dates=["datetime"])
print(f"Bars: {len(df):,}")
print(df.head())

Compute realized volatility

import numpy as np

# 5-minute sampled realized variance from 1-minute bars.
# (The original also computed 1-minute log returns on the full df here,
# but they were never used — the 5-minute subsample recomputes its own.)

# Keep only the bars on the 5-minute grid, then take log returns on it
df_5min = df[df["datetime"].dt.minute % 5 == 0].copy()
df_5min["log_return"] = np.log(df_5min["Close"] / df_5min["Close"].shift(1))

# Daily RV = sum of squared 5-minute log returns within each date.
# NOTE(review): shift(1) crosses day boundaries, so each day's first
# return is the overnight return — drop it if intraday-only RV is wanted.
daily_rv = df_5min.groupby(df_5min["datetime"].dt.date)["log_return"].apply(
    lambda x: (x ** 2).sum()
)
print("Daily realized variance (5-min):")
print(daily_rv.head(10))

Compare cleaning versions

# Load all three cleaning versions of the same ticker
raw = pd.read_parquet("AAPL_raw.parquet")
clean = pd.read_parquet("AAPL_clean.parquet")
filled = pd.read_parquet("AAPL_filled.parquet")

# Bar counts, aligned in a fixed-width column
for label, frame in (("Raw:", raw), ("Clean:", clean), ("Filled:", filled)):
    print(f"{label:<8}{len(frame):>12,} bars")

# How many bars were synthesized by the fill step, and what share
filled_count = filled["is_filled"].sum()
share = filled_count / len(filled) * 100
print(f"Filled bars: {filled_count:,} ({share:.1f}%)")

BNS jump detection

import numpy as np

def bns_jump_test(returns):
    """Barndorff-Nielsen & Shephard (2006) jump test.

    Compares realized variance (jump-sensitive) against bipower
    variation (jump-robust); a large positive z indicates jumps.

    Parameters
    ----------
    returns : pandas.Series
        Intraday log returns for one day. NaNs are dropped.

    Returns
    -------
    float
        z-statistic (compare against an upper normal quantile, e.g.
        2.326 for 1%). NaN when fewer than 10 returns survive dropna,
        or when bipower variation is zero (constant price series).
    """
    r = returns.dropna().values
    n = len(r)
    if n < 10:
        # Too few observations for the asymptotics to be meaningful.
        return np.nan

    # Realized variance: sum of squared returns.
    rv = np.sum(r ** 2)

    # Bipower variation; pi/2 is the mu_1^{-2} scaling constant.
    bv = (np.pi / 2) * np.sum(np.abs(r[1:]) * np.abs(r[:-1]))

    # Guard: a constant price series gives bv == 0, which previously
    # produced divide-by-zero warnings before yielding NaN anyway.
    if bv == 0:
        return np.nan

    # Quarticity estimate used in the variance of (rv - bv).
    qv = (n / 3) * np.sum(r[1:] ** 2 * r[:-1] ** 2)

    # Max-adjusted finite-sample correction (Huang-Tauchen style).
    # NOTE(review): this is a simplified variant — textbook BNS uses
    # tripower quarticity and theta = pi^2/4 + pi - 5; confirm intended.
    theta = max(1, qv * n / bv ** 2)

    # Z-statistic
    z = (rv - bv) / np.sqrt(theta * (2 / n) * bv ** 2)
    return z

# Run the jump test one trading day at a time
df["log_return"] = np.log(df["Close"] / df["Close"].shift(1))
by_day = df.groupby(df["datetime"].dt.date)["log_return"]
daily_z = by_day.apply(bns_jump_test)
jump_days = (daily_z > 2.326).sum()  # 2.326 = one-sided 1% normal quantile
print(f"Jump days (1% level): {jump_days}")

Load a parquet file (R)

library(arrow)

# Read a single ticker's parquet file into a data frame
df <- read_parquet("AAPL_clean.parquet")
row_count <- format(nrow(df), big.mark = ",")
cat(sprintf("Rows: %s\n", row_count))
head(df)

Query via API

library(httr)
library(readr)

api_key <- "your-key-here"
base_url <- "https://api.hfdatalibrary.com/v1"

# Request 1-minute bars for AAPL, January 2024, clean version
resp <- GET(
  paste0(base_url, "/bars/AAPL"),
  add_headers("X-API-Key" = api_key),
  query = list(
    start = "2024-01-02",
    end = "2024-01-31",
    version = "clean",
    format = "csv"
  )
)

# Fail fast on 4xx/5xx instead of parsing an error body as CSV
stop_for_status(resp)

# Explicit encoding so content() does not have to guess
df <- read_csv(content(resp, "text", encoding = "UTF-8"))
cat(sprintf("Bars: %s\n", format(nrow(df), big.mark=",")))
head(df)

Compute realized volatility

library(dplyr)

# Per-bar log returns plus a calendar date column for grouping
df <- mutate(
  df,
  log_return = log(Close / lag(Close)),
  date = as.Date(datetime)
)

# Daily realized variance at the 1-minute frequency:
# sum of squared log returns within each date (lag() NA is dropped)
daily_rv <- summarise(
  group_by(df, date),
  rv_1min = sum(log_return^2, na.rm = TRUE)
)

head(daily_rv, 10)

Load data via CSV (Stata)

* Load data via CSV: download it from the API with format=csv, or
* convert a parquet file to CSV first (Stata cannot read parquet)
import delimited "AAPL_clean.csv", clear

* Inspect variable names, types, and storage formats
describe

* Basic summary statistics for price and volume
summarize Close Volume

Compute realized volatility

* Generate log returns from consecutive closes
* NOTE(review): Close[_n-1] crosses day boundaries, so each day's first
* return is the overnight return -- drop it for intraday-only RV
gen log_return = ln(Close / Close[_n-1])

* Squared returns: must be created BEFORE collapse, which both needs
* this variable and drops log_return (the original had these two
* statements in the wrong order, so collapse referenced a variable
* that did not exist yet)
gen log_return_sq = log_return^2

* Generate the calendar date from the datetime variable
gen date = dofc(datetime)
format date %td

* Daily realized variance: sum of squared returns within each day
collapse (sum) rv = log_return_sq, by(date)
list in 1/10

Note: Stata does not natively read parquet files. Use the API with format=csv, or convert parquet to CSV using Python (pd.read_parquet("file.parquet").to_csv("file.csv")).