Summary¶

This post download data from Yahoo finance and does some basic EDA.

In [1]:

!conda env list
%matplotlib inline

# conda environments:
#
base                  *  /Users/xianshi/anaconda3
blog                     /Users/xianshi/anaconda3/envs/blog

In [2]:

#Importing The Data


from pandas_datareader import data
import pandas as pd

# Define the instruments to download. We would like to see Apple, Microsoft and the S&P500 index.
# tickers = ['W', 'HD', 'AMZN','LOW','LL']
tickers = ['ROIC', 'SKT', 'TCO' ,'SPG' ,'MAC']
# Define which online source one should use
data_source = 'yahoo'

# We would like all available data from 01/01/2000 until 12/31/2016.
start_date = '2016-12-01'
end_date = '2017-12-31'

# User pandas_reader.data.DataReader to load the desired data. As simple as that.
panel_data = data.DataReader(tickers, data_source, start_date, end_date)

In [3]:

panel_data.head()

Out[3]:

Attributes	High					Low					...	Volume					Adj Close
Symbols	MAC	ROIC	SKT	SPG	TCO	MAC	ROIC	SKT	SPG	TCO	...	MAC	ROIC	SKT	SPG	TCO	MAC	ROIC	SKT	SPG	TCO
Date
2016-12-01	68.279999	20.629999	34.669998	179.380005	72.290001	66.809998	20.190001	34.090000	175.869995	71.010002	...	927500.0	1041700.0	745100.0	2075900.0	355000.0	58.302998	18.167982	29.123262	157.796829	63.129143
2016-12-02	68.690002	20.799999	34.990002	180.789993	72.790001	67.540001	20.370001	34.380001	177.419998	71.459999	...	676100.0	601200.0	418800.0	1273900.0	250200.0	58.925732	18.436207	29.420441	160.063568	63.632339
2016-12-05	68.839996	20.770000	34.990002	180.809998	72.709999	67.680000	20.549999	34.330002	178.029999	71.739998	...	931000.0	648500.0	478000.0	1027300.0	245000.0	59.505215	18.534555	29.692146	161.312943	64.100204
2016-12-06	69.610001	21.020000	35.310001	182.139999	73.550003	68.099998	20.590000	34.830002	179.130005	72.209999	...	915600.0	590800.0	391100.0	1202000.0	269000.0	59.721443	18.597143	29.946865	160.732864	64.877075
2016-12-07	70.830002	21.350000	35.950001	183.759995	75.320000	69.169998	20.860001	35.310001	180.509995	73.660004	...	810800.0	843500.0	628000.0	1278400.0	406700.0	61.165833	18.954781	30.481783	163.981232	65.768707

5 rows × 30 columns

In [4]:

df = panel_data['Adj Close']

In [5]:

# Basic Description of the Data

df.describe()

Out[5]:

Symbols	MAC	ROIC	SKT	SPG	TCO
count	272.000000	272.000000	272.000000	272.000000	272.000000
mean	55.118781	18.442282	24.788283	151.898158	55.258763
std	4.208408	0.656658	3.412448	7.387024	6.499111
min	47.703194	16.672773	19.759792	138.740128	42.535595
25%	51.428945	17.981809	21.999384	146.275017	50.471438
50%	55.492500	18.473031	23.172514	150.871696	54.953302
75%	59.109798	18.914228	28.274007	157.328503	60.127033
max	62.869678	20.064341	31.560102	167.898178	67.782234

In [6]:

first = df.head()
last = df.tail()
print(first)
print(last)

Symbols           MAC       ROIC        SKT         SPG        TCO
Date                                                              
2016-12-01  58.302998  18.167982  29.123262  157.796829  63.129143
2016-12-02  58.925732  18.436207  29.420441  160.063568  63.632339
2016-12-05  59.505215  18.534555  29.692146  161.312943  64.100204
2016-12-06  59.721443  18.597143  29.946865  160.732864  64.877075
2016-12-07  61.165833  18.954781  30.481783  163.981232  65.768707
Symbols           MAC       ROIC        SKT         SPG        TCO
Date                                                              
2017-12-22  59.747002  18.089575  22.627144  155.758743  59.732700
2017-12-26  60.232689  18.398554  23.054577  157.893692  60.799690
2017-12-27  60.370136  18.520279  23.428577  159.208237  60.966686
2017-12-28  60.489265  18.735632  23.597769  159.562500  60.911030
2017-12-29  60.186863  18.679453  23.606674  160.112564  60.706913

In [7]:

df.sample(6)

Out[7]:

Symbols	MAC	ROIC	SKT	SPG	TCO
Date
2017-03-28	56.465996	18.949013	27.943995	151.426422	57.989056
2017-03-10	56.103745	18.396067	26.769661	151.534576	58.936821
2017-04-04	56.819401	19.276661	28.149717	153.688156	59.192852
2017-05-05	55.062401	19.003618	24.801085	148.245621	54.970566
2017-07-13	51.444580	17.836662	22.929958	144.016022	54.236202
2017-03-24	56.430656	19.222054	27.866848	151.336319	58.420258

In [8]:

# A Closer Look At Your Data: Queries

df.query('MAC == ROIC')

Out[8]:

Symbols	MAC	ROIC	SKT	SPG	TCO
Date

In [9]:

#cleaning
print(df.columns[df.isnull().any()])

Index([], dtype='object', name='Symbols')

In [10]:

# Getting all weekdays between 01/01/2000 and 12/31/2016
all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')

# How do we align the existing prices in adj_close with our new set of dates?
# All we need to do is reindex adj_close using all_weekdays as the new index
df = df.reindex(all_weekdays)

# Reindexing will insert missing values (NaN) for the dates that were not present
# in the original set. To cope with this, we can fill the missing by replacing them
# with the latest available price for each instrument.
df = df.fillna(method='ffill')
df.isnull().head()

Out[10]:

Symbols	MAC	ROIC	SKT	SPG	TCO
2016-12-01	False	False	False	False	False
2016-12-02	False	False	False	False	False
2016-12-05	False	False	False	False	False
2016-12-06	False	False	False	False	False
2016-12-07	False	False	False	False	False

In [11]:

# Define your own bins
mybins = range(int(df.MAC.min()), int(df.MAC.max()), 2)

# Cut the data with the help of the bins
df['MAC_bucket'] = pd.cut(df.MAC, bins=mybins)

# Count the number of values per bucket
df['MAC_bucket'].value_counts()

Out[11]:

(51, 53]    62
(59, 61]    61
(57, 59]    46
(49, 51]    35
(55, 57]    26
(53, 55]    21
(47, 49]    17
Name: MAC_bucket, dtype: int64