-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
392 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,392 @@ | ||
import os | ||
import sys | ||
if sys.version_info[0] >= 3: | ||
from urllib.request import urlopen | ||
from urllib.parse import quote_plus | ||
from urllib.parse import urlencode | ||
from urllib.error import HTTPError | ||
else: | ||
from urllib2 import urlopen | ||
from urllib2 import HTTPError | ||
from urllib import quote_plus | ||
from urllib import urlencode | ||
|
||
import xml.etree.ElementTree as ET | ||
import pandas as pd | ||
|
||
class Fred(object): | ||
earliest_realtime_start = '1776-07-04' | ||
latest_realtime_end = '9999-12-31' | ||
nan_char = '.' | ||
max_results_per_request = 1000 | ||
def __init__(self, | ||
api_key='', | ||
api_key_file=None): | ||
""" | ||
Initialize the Fred class that provides useful functions to query the Fred dataset. You need to specify a valid | ||
API key in one of 3 ways: pass the string via api_key, or set api_key_file to a file with the api key in the | ||
first line, or set the environment variable 'FRED_API_KEY' to the value of your api key. You can sign up for a | ||
free api key on the Fred website at http://research.stlouisfed.org/fred2/ | ||
""" | ||
self.api_key = None | ||
if api_key is not None: | ||
self.api_key = api_key | ||
elif api_key_file is not None: | ||
f = open(api_key_file, 'r') | ||
self.api_key = f.readline().strip() | ||
f.close() | ||
else: | ||
self.api_key = os.environ.get('FRED_API_KEY') | ||
|
||
if self.api_key is None: | ||
raise ValueError("You need to set a valid API key. You can set it in 3 ways: pass the string with api_key, " | ||
"or set api_key_file to a file with the api key in the first line, or set the environment " | ||
"variable 'FRED_API_KEY' to the value of your api key. You can sign up for a free api key " | ||
"on the Fred website at http://research.stlouisfed.org/fred2/") | ||
|
||
def __fetch_data(self, url): | ||
""" | ||
helper function for fetching data given a request URL | ||
""" | ||
try: | ||
response = urlopen(url) | ||
root = ET.fromstring(response.read()) | ||
except HTTPError as exc: | ||
root = ET.fromstring(exc.read()) | ||
raise ValueError(root.get('message')) | ||
return root | ||
|
||
def _parse(self, date_str, format='%Y-%m-%d'): | ||
""" | ||
helper function for parsing FRED date string into datetime | ||
""" | ||
return pd.to_datetime(date_str, format=format).to_datetime() | ||
|
||
def get_series_info(self, series_id): | ||
""" | ||
Get information about a series such as its title, frequency, observation start/end dates, units, notes, etc. | ||
Parameters | ||
---------- | ||
series_id : str | ||
Fred series id such as 'CPIAUCSL' | ||
Returns | ||
------- | ||
info : Series | ||
a pandas Series containing information about the Fred series | ||
""" | ||
url = "http://api.stlouisfed.org/fred/series?series_id=%s&api_key=%s" % (series_id, self.api_key) | ||
root = self.__fetch_data(url) | ||
if root is None: | ||
raise ValueError('No info exists for series id: ' + series_id) | ||
info = pd.Series(root.getchildren()[0].attrib) | ||
return info | ||
|
||
def get_series(self, series_id, observation_start=None, observation_end=None, **kwargs): | ||
""" | ||
Get data for a Fred series id. This fetches the latest known data, and is equivalent to get_series_latest_release() | ||
Parameters | ||
---------- | ||
series_id : str | ||
Fred series id such as 'CPIAUCSL' | ||
observation_start : datetime or datetime-like str such as '7/1/2014', optional | ||
earliest observation date | ||
observation_end : datetime or datetime-like str such as '7/1/2014', optional | ||
latest observation date | ||
kwargs : additional parameters | ||
Any additional parameters supported by FRED. You can see http://api.stlouisfed.org/docs/fred/series_observations.html for the full list | ||
Returns | ||
------- | ||
data : Series | ||
a Series where each index is the observation date and the value is the data for the Fred series | ||
""" | ||
url = "http://api.stlouisfed.org/fred/series/observations?series_id=%s&api_key=%s" % (series_id, self.api_key) | ||
|
||
if observation_start is not None: | ||
observation_start = pd.to_datetime(observation_start, errors='raise') | ||
url += '&observation_start=' + observation_start.strftime('%Y-%m-%d') | ||
if observation_end is not None: | ||
observation_end = pd.to_datetime(observation_end, errors='raise') | ||
url += '&observation_end=' + observation_end.strftime('%Y-%m-%d') | ||
|
||
if kwargs is not None: | ||
url += '&' + urlencode(kwargs) | ||
|
||
root = self.__fetch_data(url) | ||
if root is None: | ||
raise ValueError('No data exists for series id: ' + series_id) | ||
data = {} | ||
for child in root.getchildren(): | ||
val = child.get('value') | ||
if val == self.nan_char: | ||
val = float('NaN') | ||
else: | ||
val = float(val) | ||
data[self._parse(child.get('date'))] = val | ||
return pd.Series(data) | ||
|
||
def get_series_latest_release(self, series_id): | ||
""" | ||
Get data for a Fred series id. This fetches the latest known data, and is equivalent to get_series() | ||
Parameters | ||
---------- | ||
series_id : str | ||
Fred series id such as 'CPIAUCSL' | ||
Returns | ||
------- | ||
info : Series | ||
a Series where each index is the observation date and the value is the data for the Fred series | ||
""" | ||
return self.get_series(series_id) | ||
|
||
def get_series_first_release(self, series_id): | ||
""" | ||
Get first-release data for a Fred series id. This ignores any revision to the data series. For instance, | ||
The US GDP for Q1 2014 was first released to be 17149.6, and then later revised to 17101.3, and 17016.0. | ||
This will ignore revisions after the first release. | ||
Parameters | ||
---------- | ||
series_id : str | ||
Fred series id such as 'GDP' | ||
Returns | ||
------- | ||
data : Series | ||
a Series where each index is the observation date and the value is the data for the Fred series | ||
""" | ||
df = self.get_series_all_releases(series_id) | ||
first_release = df.groupby('date').head(1) | ||
data = first_release.set_index('date')['value'] | ||
return data | ||
|
||
def get_series_as_of_date(self, series_id, as_of_date): | ||
""" | ||
Get latest data for a Fred series id as known on a particular date. This includes any revision to the data series | ||
before or on as_of_date, but ignores any revision on dates after as_of_date. | ||
Parameters | ||
---------- | ||
series_id : str | ||
Fred series id such as 'GDP' | ||
as_of_date : datetime, or datetime-like str such as '10/25/2014' | ||
Include data revisions on or before this date, and ignore revisions afterwards | ||
Returns | ||
------- | ||
data : Series | ||
a Series where each index is the observation date and the value is the data for the Fred series | ||
""" | ||
as_of_date = pd.to_datetime(as_of_date) | ||
df = self.get_series_all_releases(series_id) | ||
data = df[df['realtime_start'] <= as_of_date] | ||
return data | ||
|
||
def get_series_all_releases(self, series_id): | ||
""" | ||
Get all data for a Fred series id including first releases and all revisions. This returns a DataFrame | ||
with three columns: 'date', 'realtime_start', and 'value'. For instance, the US GDP for Q4 2013 was first released | ||
to be 17102.5 on 2014-01-30, and then revised to 17080.7 on 2014-02-28, and then revised to 17089.6 on | ||
2014-03-27. You will therefore get three rows with the same 'date' (observation date) of 2013-10-01 but three | ||
different 'realtime_start' of 2014-01-30, 2014-02-28, and 2014-03-27 with corresponding 'value' of 17102.5, 17080.7 | ||
and 17089.6 | ||
Parameters | ||
---------- | ||
series_id : str | ||
Fred series id such as 'GDP' | ||
Returns | ||
------- | ||
data : DataFrame | ||
a DataFrame with columns 'date', 'realtime_start' and 'value' where 'date' is the observation period and 'realtime_start' | ||
is when the corresponding value (either first release or revision) is reported. | ||
""" | ||
url = "http://api.stlouisfedorg/fred/series/observations?series_id=%s&api_key=%s&realtime_start=%s&realtime_end=%s" % (series_id, | ||
self.api_key, | ||
self.earliest_realtime_start, | ||
self.latest_realtime_end) | ||
root = self.__fetch_data(url) | ||
if root is None: | ||
raise ValueError('No data exists for series id: ' + series_id) | ||
data = {} | ||
i = 0 | ||
for child in root.getchildren(): | ||
val = child.get('value') | ||
if val == self.nan_char: | ||
val = float('NaN') | ||
else: | ||
val = float(val) | ||
realtime_start = self._parse(child.get('realtime_start')) | ||
# realtime_end = self._parse(child.get('realtime_end')) | ||
date = self._parse(child.get('date')) | ||
|
||
data[i] = {'realtime_start': realtime_start, | ||
# 'realtime_end': realtime_end, | ||
'date': date, | ||
'value': val} | ||
i += 1 | ||
data = pd.DataFrame(data).T | ||
return data | ||
|
||
def get_series_vintage_dates(self, series_id): | ||
""" | ||
Get a list of vintage dates for a series. Vintage dates are the dates in history when a | ||
series' data values were revised or new data values were released. | ||
Parameters | ||
---------- | ||
series_id : str | ||
Fred series id such as 'CPIAUCSL' | ||
Returns | ||
------- | ||
dates : list | ||
list of vintage dates | ||
""" | ||
url = "http://api.stlouisfed.org/fred/series/vintagedates?series_id=%s&api_key=%s" % (series_id, self.api_key) | ||
root = self.__fetch_data(url) | ||
if root is None: | ||
raise ValueError('No vintage date exists for series id: ' + series_id) | ||
dates = [] | ||
for child in root.getchildren(): | ||
dates.append(self._parse(child.text)) | ||
return dates | ||
|
||
def __do_series_search(self, url): | ||
""" | ||
helper function for making one HTTP request for data, and parsing the returned results into a DataFrame | ||
""" | ||
root = self.__fetch_data(url) | ||
|
||
series_ids = [] | ||
data = {} | ||
|
||
num_results_returned = 0 # number of results returned in this HTTP request | ||
num_results_total = int(root.get('count')) # total number of results, this can be larger than number of results returned | ||
for child in root.getchildren(): | ||
num_results_returned += 1 | ||
series_id = child.get('id') | ||
series_ids.append(series_id) | ||
data[series_id] = {"id": series_id} | ||
fields = ["realtime_start", "realtime_end", "title", "observation_start", "observation_end", | ||
"frequency", "frequency_short", "units", "units_short", "seasonal_adjustment", | ||
"seasonal_adjustment_short", "last_updated", "popularity", "notes"] | ||
for field in fields: | ||
data[series_id][field] = child.get(field) | ||
|
||
if num_results_returned > 0: | ||
data = pd.DataFrame(data, columns=series_ids).T | ||
# parse datetime columns | ||
for field in ["realtime_start", "realtime_end", "observation_start", "observation_end", "last_updated"]: | ||
data[field] = data[field].apply(self._parse, format=None) | ||
# set index name | ||
data.index.name = 'series id' | ||
else: | ||
data = None | ||
return data, num_results_total | ||
|
||
def __get_search_results(self, url, limit, order_by, sort_order): | ||
""" | ||
helper function for getting search results up to specified limit on the number of results. The Fred HTTP API | ||
truncates to 1000 results per request, so this may issue multiple HTTP requests to obtain more available data. | ||
""" | ||
|
||
order_by_options = ['search_rank', 'series_id', 'title', 'units', 'frequency', | ||
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', | ||
'observation_start', 'observation_end', 'popularity'] | ||
if order_by is not None: | ||
if order_by in order_by_options: | ||
url = url + '&order_by=' + order_by | ||
else: | ||
raise ValueError('%s is not in the valid list of order_by options: %s' % (order_by, str(order_by_options))) | ||
|
||
sort_order_options = ['asc', 'desc'] | ||
if sort_order is not None: | ||
if sort_order in sort_order_options: | ||
url = url + '&sort_order=' + sort_order | ||
else: | ||
raise ValueError('%s is not in the valid list of sort_order options: %s' % (sort_order, str(sort_order_options))) | ||
|
||
data, num_results_total = self.__do_series_search(url) | ||
if data is None: | ||
return data | ||
|
||
if limit == 0: | ||
max_results_needed = num_results_total | ||
else: | ||
max_results_needed = limit | ||
|
||
if max_results_needed > self.max_results_per_request: | ||
for i in range(1, max_results_needed // self.max_results_per_request + 1): | ||
offset = i * self.max_results_per_request | ||
next_data, _ = self.__do_series_search(url + '&offset=' + str(offset)) | ||
data = data.append(next_data) | ||
return data.head(max_results_needed) | ||
|
||
def search(self, text, limit=1000, order_by=None, sort_order=None): | ||
""" | ||
Do a fulltext search for series in the Fred dataset. Returns information about matching series in a DataFrame. | ||
Parameters | ||
---------- | ||
text : str | ||
text to do fulltext search on, e.g., 'Real GDP' | ||
limit : int, optional | ||
limit the number of results to this value. If limit is 0, it means fetching all results without limit. | ||
order_by : str, optional | ||
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency', | ||
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end', | ||
'popularity' | ||
sort_order : str, optional | ||
sort the results by ascending or descending order. Valid options are 'asc' or 'desc' | ||
Returns | ||
------- | ||
info : DataFrame | ||
a DataFrame containing information about the matching Fred series | ||
""" | ||
url = "http://api.stlouisfed.org/fred/series/search?search_text=%s&api_key=%s" % (quote_plus(text), self.api_key) | ||
info = self.__get_search_results(url, limit, order_by, sort_order) | ||
return info | ||
|
||
def search_by_release(self, release_id, limit=0, order_by=None, sort_order=None): | ||
""" | ||
Search for series that belongs to a release id. Returns information about matching series in a DataFrame. | ||
Parameters | ||
---------- | ||
release_id : int | ||
release id, e.g., 151 | ||
limit : int, optional | ||
limit the number of results to this value. If limit is 0, it means fetching all results without limit. | ||
order_by : str, optional | ||
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency', | ||
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end', | ||
'popularity' | ||
sort_order : str, optional | ||
sort the results by ascending or descending order. Valid options are 'asc' or 'desc' | ||
Returns | ||
------- | ||
info : DataFrame | ||
a DataFrame containing information about the matching Fred series | ||
""" | ||
url = "http://api.stlouisfed.org/fred/release/series?release_id=%d&&api_key=%s" % (release_id, self.api_key) | ||
info = self.__get_search_results(url, limit, order_by, sort_order) | ||
if info is None: | ||
raise ValueError('No series exists for release id: ' + str(release_id)) | ||
return info | ||
|
||
def search_by_category(self, category_id, limit=0, order_by=None, sort_order=None): | ||
""" | ||
Search for series that belongs to a category id. Returns information about matching series in a DataFrame. | ||
Parameters | ||
---------- | ||
category_id : int | ||
category id, e.g., 32145 | ||
limit : int, optional | ||
limit the number of results to this value. If limit is 0, it means fetching all results without limit. | ||
order_by : str, optional | ||
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency', | ||
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end', | ||
'popularity' | ||
sort_order : str, optional | ||
sort the results by ascending or descending order. Valid options are 'asc' or 'desc' | ||
Returns | ||
------- | ||
info : DataFrame | ||
a DataFrame containing information about the matching Fred series | ||
""" | ||
url = "http://api.stlouisfed.org/fred/category/series?category_id=%d&api_key=%s" % (category_id, self.api_key) | ||
info = self.__get_search_results(url, limit, order_by, sort_order) | ||
if info is None: | ||
raise ValueError('No series exists for category id: ' + str(category_id)) | ||
return info |