Skip to content

Commit

Permalink
Create fred.py
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelv2 authored Jun 13, 2018
1 parent 59ae8b0 commit a9c71fb
Showing 1 changed file with 392 additions and 0 deletions.
392 changes: 392 additions & 0 deletions fred.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,392 @@
import os
import sys
if sys.version_info[0] >= 3:
from urllib.request import urlopen
from urllib.parse import quote_plus
from urllib.parse import urlencode
from urllib.error import HTTPError
else:
from urllib2 import urlopen
from urllib2 import HTTPError
from urllib import quote_plus
from urllib import urlencode

import xml.etree.ElementTree as ET
import pandas as pd

class Fred(object):
earliest_realtime_start = '1776-07-04'
latest_realtime_end = '9999-12-31'
nan_char = '.'
max_results_per_request = 1000
def __init__(self,
api_key='',
api_key_file=None):
"""
Initialize the Fred class that provides useful functions to query the Fred dataset. You need to specify a valid
API key in one of 3 ways: pass the string via api_key, or set api_key_file to a file with the api key in the
first line, or set the environment variable 'FRED_API_KEY' to the value of your api key. You can sign up for a
free api key on the Fred website at http://research.stlouisfed.org/fred2/
"""
self.api_key = None
if api_key is not None:
self.api_key = api_key
elif api_key_file is not None:
f = open(api_key_file, 'r')
self.api_key = f.readline().strip()
f.close()
else:
self.api_key = os.environ.get('FRED_API_KEY')

if self.api_key is None:
raise ValueError("You need to set a valid API key. You can set it in 3 ways: pass the string with api_key, "
"or set api_key_file to a file with the api key in the first line, or set the environment "
"variable 'FRED_API_KEY' to the value of your api key. You can sign up for a free api key "
"on the Fred website at http://research.stlouisfed.org/fred2/")

def __fetch_data(self, url):
"""
helper function for fetching data given a request URL
"""
try:
response = urlopen(url)
root = ET.fromstring(response.read())
except HTTPError as exc:
root = ET.fromstring(exc.read())
raise ValueError(root.get('message'))
return root

def _parse(self, date_str, format='%Y-%m-%d'):
"""
helper function for parsing FRED date string into datetime
"""
return pd.to_datetime(date_str, format=format).to_datetime()

def get_series_info(self, series_id):
"""
Get information about a series such as its title, frequency, observation start/end dates, units, notes, etc.
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
Returns
-------
info : Series
a pandas Series containing information about the Fred series
"""
url = "http://api.stlouisfed.org/fred/series?series_id=%s&api_key=%s" % (series_id, self.api_key)
root = self.__fetch_data(url)
if root is None:
raise ValueError('No info exists for series id: ' + series_id)
info = pd.Series(root.getchildren()[0].attrib)
return info

def get_series(self, series_id, observation_start=None, observation_end=None, **kwargs):
"""
Get data for a Fred series id. This fetches the latest known data, and is equivalent to get_series_latest_release()
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
observation_start : datetime or datetime-like str such as '7/1/2014', optional
earliest observation date
observation_end : datetime or datetime-like str such as '7/1/2014', optional
latest observation date
kwargs : additional parameters
Any additional parameters supported by FRED. You can see http://api.stlouisfed.org/docs/fred/series_observations.html for the full list
Returns
-------
data : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
url = "http://api.stlouisfed.org/fred/series/observations?series_id=%s&api_key=%s" % (series_id, self.api_key)

if observation_start is not None:
observation_start = pd.to_datetime(observation_start, errors='raise')
url += '&observation_start=' + observation_start.strftime('%Y-%m-%d')
if observation_end is not None:
observation_end = pd.to_datetime(observation_end, errors='raise')
url += '&observation_end=' + observation_end.strftime('%Y-%m-%d')

if kwargs is not None:
url += '&' + urlencode(kwargs)

root = self.__fetch_data(url)
if root is None:
raise ValueError('No data exists for series id: ' + series_id)
data = {}
for child in root.getchildren():
val = child.get('value')
if val == self.nan_char:
val = float('NaN')
else:
val = float(val)
data[self._parse(child.get('date'))] = val
return pd.Series(data)

def get_series_latest_release(self, series_id):
"""
Get data for a Fred series id. This fetches the latest known data, and is equivalent to get_series()
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
Returns
-------
info : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
return self.get_series(series_id)

def get_series_first_release(self, series_id):
"""
Get first-release data for a Fred series id. This ignores any revision to the data series. For instance,
The US GDP for Q1 2014 was first released to be 17149.6, and then later revised to 17101.3, and 17016.0.
This will ignore revisions after the first release.
Parameters
----------
series_id : str
Fred series id such as 'GDP'
Returns
-------
data : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
df = self.get_series_all_releases(series_id)
first_release = df.groupby('date').head(1)
data = first_release.set_index('date')['value']
return data

def get_series_as_of_date(self, series_id, as_of_date):
"""
Get latest data for a Fred series id as known on a particular date. This includes any revision to the data series
before or on as_of_date, but ignores any revision on dates after as_of_date.
Parameters
----------
series_id : str
Fred series id such as 'GDP'
as_of_date : datetime, or datetime-like str such as '10/25/2014'
Include data revisions on or before this date, and ignore revisions afterwards
Returns
-------
data : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
as_of_date = pd.to_datetime(as_of_date)
df = self.get_series_all_releases(series_id)
data = df[df['realtime_start'] <= as_of_date]
return data

def get_series_all_releases(self, series_id):
"""
Get all data for a Fred series id including first releases and all revisions. This returns a DataFrame
with three columns: 'date', 'realtime_start', and 'value'. For instance, the US GDP for Q4 2013 was first released
to be 17102.5 on 2014-01-30, and then revised to 17080.7 on 2014-02-28, and then revised to 17089.6 on
2014-03-27. You will therefore get three rows with the same 'date' (observation date) of 2013-10-01 but three
different 'realtime_start' of 2014-01-30, 2014-02-28, and 2014-03-27 with corresponding 'value' of 17102.5, 17080.7
and 17089.6
Parameters
----------
series_id : str
Fred series id such as 'GDP'
Returns
-------
data : DataFrame
a DataFrame with columns 'date', 'realtime_start' and 'value' where 'date' is the observation period and 'realtime_start'
is when the corresponding value (either first release or revision) is reported.
"""
url = "http://api.stlouisfedorg/fred/series/observations?series_id=%s&api_key=%s&realtime_start=%s&realtime_end=%s" % (series_id,
self.api_key,
self.earliest_realtime_start,
self.latest_realtime_end)
root = self.__fetch_data(url)
if root is None:
raise ValueError('No data exists for series id: ' + series_id)
data = {}
i = 0
for child in root.getchildren():
val = child.get('value')
if val == self.nan_char:
val = float('NaN')
else:
val = float(val)
realtime_start = self._parse(child.get('realtime_start'))
# realtime_end = self._parse(child.get('realtime_end'))
date = self._parse(child.get('date'))

data[i] = {'realtime_start': realtime_start,
# 'realtime_end': realtime_end,
'date': date,
'value': val}
i += 1
data = pd.DataFrame(data).T
return data

def get_series_vintage_dates(self, series_id):
"""
Get a list of vintage dates for a series. Vintage dates are the dates in history when a
series' data values were revised or new data values were released.
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
Returns
-------
dates : list
list of vintage dates
"""
url = "http://api.stlouisfed.org/fred/series/vintagedates?series_id=%s&api_key=%s" % (series_id, self.api_key)
root = self.__fetch_data(url)
if root is None:
raise ValueError('No vintage date exists for series id: ' + series_id)
dates = []
for child in root.getchildren():
dates.append(self._parse(child.text))
return dates

def __do_series_search(self, url):
"""
helper function for making one HTTP request for data, and parsing the returned results into a DataFrame
"""
root = self.__fetch_data(url)

series_ids = []
data = {}

num_results_returned = 0 # number of results returned in this HTTP request
num_results_total = int(root.get('count')) # total number of results, this can be larger than number of results returned
for child in root.getchildren():
num_results_returned += 1
series_id = child.get('id')
series_ids.append(series_id)
data[series_id] = {"id": series_id}
fields = ["realtime_start", "realtime_end", "title", "observation_start", "observation_end",
"frequency", "frequency_short", "units", "units_short", "seasonal_adjustment",
"seasonal_adjustment_short", "last_updated", "popularity", "notes"]
for field in fields:
data[series_id][field] = child.get(field)

if num_results_returned > 0:
data = pd.DataFrame(data, columns=series_ids).T
# parse datetime columns
for field in ["realtime_start", "realtime_end", "observation_start", "observation_end", "last_updated"]:
data[field] = data[field].apply(self._parse, format=None)
# set index name
data.index.name = 'series id'
else:
data = None
return data, num_results_total

def __get_search_results(self, url, limit, order_by, sort_order):
"""
helper function for getting search results up to specified limit on the number of results. The Fred HTTP API
truncates to 1000 results per request, so this may issue multiple HTTP requests to obtain more available data.
"""

order_by_options = ['search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated',
'observation_start', 'observation_end', 'popularity']
if order_by is not None:
if order_by in order_by_options:
url = url + '&order_by=' + order_by
else:
raise ValueError('%s is not in the valid list of order_by options: %s' % (order_by, str(order_by_options)))

sort_order_options = ['asc', 'desc']
if sort_order is not None:
if sort_order in sort_order_options:
url = url + '&sort_order=' + sort_order
else:
raise ValueError('%s is not in the valid list of sort_order options: %s' % (sort_order, str(sort_order_options)))

data, num_results_total = self.__do_series_search(url)
if data is None:
return data

if limit == 0:
max_results_needed = num_results_total
else:
max_results_needed = limit

if max_results_needed > self.max_results_per_request:
for i in range(1, max_results_needed // self.max_results_per_request + 1):
offset = i * self.max_results_per_request
next_data, _ = self.__do_series_search(url + '&offset=' + str(offset))
data = data.append(next_data)
return data.head(max_results_needed)

def search(self, text, limit=1000, order_by=None, sort_order=None):
"""
Do a fulltext search for series in the Fred dataset. Returns information about matching series in a DataFrame.
Parameters
----------
text : str
text to do fulltext search on, e.g., 'Real GDP'
limit : int, optional
limit the number of results to this value. If limit is 0, it means fetching all results without limit.
order_by : str, optional
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end',
'popularity'
sort_order : str, optional
sort the results by ascending or descending order. Valid options are 'asc' or 'desc'
Returns
-------
info : DataFrame
a DataFrame containing information about the matching Fred series
"""
url = "http://api.stlouisfed.org/fred/series/search?search_text=%s&api_key=%s" % (quote_plus(text), self.api_key)
info = self.__get_search_results(url, limit, order_by, sort_order)
return info

def search_by_release(self, release_id, limit=0, order_by=None, sort_order=None):
"""
Search for series that belongs to a release id. Returns information about matching series in a DataFrame.
Parameters
----------
release_id : int
release id, e.g., 151
limit : int, optional
limit the number of results to this value. If limit is 0, it means fetching all results without limit.
order_by : str, optional
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end',
'popularity'
sort_order : str, optional
sort the results by ascending or descending order. Valid options are 'asc' or 'desc'
Returns
-------
info : DataFrame
a DataFrame containing information about the matching Fred series
"""
url = "http://api.stlouisfed.org/fred/release/series?release_id=%d&&api_key=%s" % (release_id, self.api_key)
info = self.__get_search_results(url, limit, order_by, sort_order)
if info is None:
raise ValueError('No series exists for release id: ' + str(release_id))
return info

def search_by_category(self, category_id, limit=0, order_by=None, sort_order=None):
"""
Search for series that belongs to a category id. Returns information about matching series in a DataFrame.
Parameters
----------
category_id : int
category id, e.g., 32145
limit : int, optional
limit the number of results to this value. If limit is 0, it means fetching all results without limit.
order_by : str, optional
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end',
'popularity'
sort_order : str, optional
sort the results by ascending or descending order. Valid options are 'asc' or 'desc'
Returns
-------
info : DataFrame
a DataFrame containing information about the matching Fred series
"""
url = "http://api.stlouisfed.org/fred/category/series?category_id=%d&api_key=%s" % (category_id, self.api_key)
info = self.__get_search_results(url, limit, order_by, sort_order)
if info is None:
raise ValueError('No series exists for category id: ' + str(category_id))
return info

0 comments on commit a9c71fb

Please sign in to comment.