Exploratory Data Analysis (EDA) Tutorial

Summary

This post download data from Yahoo finance and does some basic EDA.

In [1]:
!conda env list
%matplotlib inline
# conda environments:
#
base                  *  /Users/xianshi/anaconda3
blog                     /Users/xianshi/anaconda3/envs/blog

In [2]:
#Importing The Data


from pandas_datareader import data
import pandas as pd

# Define the instruments to download. We would like to see Apple, Microsoft and the S&P500 index.
# tickers = ['W', 'HD', 'AMZN','LOW','LL']
tickers = ['ROIC', 'SKT', 'TCO' ,'SPG' ,'MAC']
# Define which online source one should use
data_source = 'yahoo'

# We would like all available data from 01/01/2000 until 12/31/2016.
start_date = '2016-12-01'
end_date = '2017-12-31'

# User pandas_reader.data.DataReader to load the desired data. As simple as that.
panel_data = data.DataReader(tickers, data_source, start_date, end_date)
In [3]:
panel_data.head()
Out[3]:
Attributes High Low ... Volume Adj Close
Symbols MAC ROIC SKT SPG TCO MAC ROIC SKT SPG TCO ... MAC ROIC SKT SPG TCO MAC ROIC SKT SPG TCO
Date
2016-12-01 68.279999 20.629999 34.669998 179.380005 72.290001 66.809998 20.190001 34.090000 175.869995 71.010002 ... 927500.0 1041700.0 745100.0 2075900.0 355000.0 58.302998 18.167982 29.123262 157.796829 63.129143
2016-12-02 68.690002 20.799999 34.990002 180.789993 72.790001 67.540001 20.370001 34.380001 177.419998 71.459999 ... 676100.0 601200.0 418800.0 1273900.0 250200.0 58.925732 18.436207 29.420441 160.063568 63.632339
2016-12-05 68.839996 20.770000 34.990002 180.809998 72.709999 67.680000 20.549999 34.330002 178.029999 71.739998 ... 931000.0 648500.0 478000.0 1027300.0 245000.0 59.505215 18.534555 29.692146 161.312943 64.100204
2016-12-06 69.610001 21.020000 35.310001 182.139999 73.550003 68.099998 20.590000 34.830002 179.130005 72.209999 ... 915600.0 590800.0 391100.0 1202000.0 269000.0 59.721443 18.597143 29.946865 160.732864 64.877075
2016-12-07 70.830002 21.350000 35.950001 183.759995 75.320000 69.169998 20.860001 35.310001 180.509995 73.660004 ... 810800.0 843500.0 628000.0 1278400.0 406700.0 61.165833 18.954781 30.481783 163.981232 65.768707

5 rows × 30 columns

In [4]:
df = panel_data['Adj Close']
In [5]:
# Basic Description of the Data

df.describe()
Out[5]:
Symbols MAC ROIC SKT SPG TCO
count 272.000000 272.000000 272.000000 272.000000 272.000000
mean 55.118781 18.442282 24.788283 151.898158 55.258763
std 4.208408 0.656658 3.412448 7.387024 6.499111
min 47.703194 16.672773 19.759792 138.740128 42.535595
25% 51.428945 17.981809 21.999384 146.275017 50.471438
50% 55.492500 18.473031 23.172514 150.871696 54.953302
75% 59.109798 18.914228 28.274007 157.328503 60.127033
max 62.869678 20.064341 31.560102 167.898178 67.782234
In [6]:
first = df.head()
last = df.tail()
print(first)
print(last)
Symbols           MAC       ROIC        SKT         SPG        TCO
Date                                                              
2016-12-01  58.302998  18.167982  29.123262  157.796829  63.129143
2016-12-02  58.925732  18.436207  29.420441  160.063568  63.632339
2016-12-05  59.505215  18.534555  29.692146  161.312943  64.100204
2016-12-06  59.721443  18.597143  29.946865  160.732864  64.877075
2016-12-07  61.165833  18.954781  30.481783  163.981232  65.768707
Symbols           MAC       ROIC        SKT         SPG        TCO
Date                                                              
2017-12-22  59.747002  18.089575  22.627144  155.758743  59.732700
2017-12-26  60.232689  18.398554  23.054577  157.893692  60.799690
2017-12-27  60.370136  18.520279  23.428577  159.208237  60.966686
2017-12-28  60.489265  18.735632  23.597769  159.562500  60.911030
2017-12-29  60.186863  18.679453  23.606674  160.112564  60.706913
In [7]:
df.sample(6)
Out[7]:
Symbols MAC ROIC SKT SPG TCO
Date
2017-03-28 56.465996 18.949013 27.943995 151.426422 57.989056
2017-03-10 56.103745 18.396067 26.769661 151.534576 58.936821
2017-04-04 56.819401 19.276661 28.149717 153.688156 59.192852
2017-05-05 55.062401 19.003618 24.801085 148.245621 54.970566
2017-07-13 51.444580 17.836662 22.929958 144.016022 54.236202
2017-03-24 56.430656 19.222054 27.866848 151.336319 58.420258
In [8]:
# A Closer Look At Your Data: Queries

df.query('MAC == ROIC')
Out[8]:
Symbols MAC ROIC SKT SPG TCO
Date
In [9]:
#cleaning
print(df.columns[df.isnull().any()])
Index([], dtype='object', name='Symbols')
In [10]:
# Getting all weekdays between 01/01/2000 and 12/31/2016
all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')

# How do we align the existing prices in adj_close with our new set of dates?
# All we need to do is reindex adj_close using all_weekdays as the new index
df = df.reindex(all_weekdays)

# Reindexing will insert missing values (NaN) for the dates that were not present
# in the original set. To cope with this, we can fill the missing by replacing them
# with the latest available price for each instrument.
df = df.fillna(method='ffill')
df.isnull().head()
Out[10]:
Symbols MAC ROIC SKT SPG TCO
2016-12-01 False False False False False
2016-12-02 False False False False False
2016-12-05 False False False False False
2016-12-06 False False False False False
2016-12-07 False False False False False
In [11]:
# Define your own bins
mybins = range(int(df.MAC.min()), int(df.MAC.max()), 2)

# Cut the data with the help of the bins
df['MAC_bucket'] = pd.cut(df.MAC, bins=mybins)

# Count the number of values per bucket
df['MAC_bucket'].value_counts()
Out[11]:
(51, 53]    62
(59, 61]    61
(57, 59]    46
(49, 51]    35
(55, 57]    26
(53, 55]    21
(47, 49]    17
Name: MAC_bucket, dtype: int64