{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Summary\n", "This post download data from Yahoo finance and does some basic EDA." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# conda environments:\r\n", "#\r\n", "base * /Users/xianshi/anaconda3\r\n", "blog /Users/xianshi/anaconda3/envs/blog\r\n", "\r\n" ] } ], "source": [ "!conda env list\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#Importing The Data\n", "\n", "\n", "from pandas_datareader import data\n", "import pandas as pd\n", "\n", "# Define the instruments to download. We would like to see Apple, Microsoft and the S&P500 index.\n", "# tickers = ['W', 'HD', 'AMZN','LOW','LL']\n", "tickers = ['ROIC', 'SKT', 'TCO' ,'SPG' ,'MAC']\n", "# Define which online source one should use\n", "data_source = 'yahoo'\n", "\n", "# We would like all available data from 01/01/2000 until 12/31/2016.\n", "start_date = '2016-12-01'\n", "end_date = '2017-12-31'\n", "\n", "# User pandas_reader.data.DataReader to load the desired data. As simple as that.\n", "panel_data = data.DataReader(tickers, data_source, start_date, end_date)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AttributesHighLow...VolumeAdj Close
SymbolsMACROICSKTSPGTCOMACROICSKTSPGTCO...MACROICSKTSPGTCOMACROICSKTSPGTCO
Date
2016-12-0168.27999920.62999934.669998179.38000572.29000166.80999820.19000134.090000175.86999571.010002...927500.01041700.0745100.02075900.0355000.058.30299818.16798229.123262157.79682963.129143
2016-12-0268.69000220.79999934.990002180.78999372.79000167.54000120.37000134.380001177.41999871.459999...676100.0601200.0418800.01273900.0250200.058.92573218.43620729.420441160.06356863.632339
2016-12-0568.83999620.77000034.990002180.80999872.70999967.68000020.54999934.330002178.02999971.739998...931000.0648500.0478000.01027300.0245000.059.50521518.53455529.692146161.31294364.100204
2016-12-0669.61000121.02000035.310001182.13999973.55000368.09999820.59000034.830002179.13000572.209999...915600.0590800.0391100.01202000.0269000.059.72144318.59714329.946865160.73286464.877075
2016-12-0770.83000221.35000035.950001183.75999575.32000069.16999820.86000135.310001180.50999573.660004...810800.0843500.0628000.01278400.0406700.061.16583318.95478130.481783163.98123265.768707
\n", "

5 rows × 30 columns

\n", "
" ], "text/plain": [ "Attributes High Low \\\n", "Symbols MAC ROIC SKT SPG TCO MAC \n", "Date \n", "2016-12-01 68.279999 20.629999 34.669998 179.380005 72.290001 66.809998 \n", "2016-12-02 68.690002 20.799999 34.990002 180.789993 72.790001 67.540001 \n", "2016-12-05 68.839996 20.770000 34.990002 180.809998 72.709999 67.680000 \n", "2016-12-06 69.610001 21.020000 35.310001 182.139999 73.550003 68.099998 \n", "2016-12-07 70.830002 21.350000 35.950001 183.759995 75.320000 69.169998 \n", "\n", "Attributes ... Volume \\\n", "Symbols ROIC SKT SPG TCO ... MAC \n", "Date ... \n", "2016-12-01 20.190001 34.090000 175.869995 71.010002 ... 927500.0 \n", "2016-12-02 20.370001 34.380001 177.419998 71.459999 ... 676100.0 \n", "2016-12-05 20.549999 34.330002 178.029999 71.739998 ... 931000.0 \n", "2016-12-06 20.590000 34.830002 179.130005 72.209999 ... 915600.0 \n", "2016-12-07 20.860001 35.310001 180.509995 73.660004 ... 810800.0 \n", "\n", "Attributes Adj Close \\\n", "Symbols ROIC SKT SPG TCO MAC ROIC \n", "Date \n", "2016-12-01 1041700.0 745100.0 2075900.0 355000.0 58.302998 18.167982 \n", "2016-12-02 601200.0 418800.0 1273900.0 250200.0 58.925732 18.436207 \n", "2016-12-05 648500.0 478000.0 1027300.0 245000.0 59.505215 18.534555 \n", "2016-12-06 590800.0 391100.0 1202000.0 269000.0 59.721443 18.597143 \n", "2016-12-07 843500.0 628000.0 1278400.0 406700.0 61.165833 18.954781 \n", "\n", "Attributes \n", "Symbols SKT SPG TCO \n", "Date \n", "2016-12-01 29.123262 157.796829 63.129143 \n", "2016-12-02 29.420441 160.063568 63.632339 \n", "2016-12-05 29.692146 161.312943 64.100204 \n", "2016-12-06 29.946865 160.732864 64.877075 \n", "2016-12-07 30.481783 163.981232 65.768707 \n", "\n", "[5 rows x 30 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "panel_data.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = panel_data['Adj Close']" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SymbolsMACROICSKTSPGTCO
count272.000000272.000000272.000000272.000000272.000000
mean55.11878118.44228224.788283151.89815855.258763
std4.2084080.6566583.4124487.3870246.499111
min47.70319416.67277319.759792138.74012842.535595
25%51.42894517.98180921.999384146.27501750.471438
50%55.49250018.47303123.172514150.87169654.953302
75%59.10979818.91422828.274007157.32850360.127033
max62.86967820.06434131.560102167.89817867.782234
\n", "
" ], "text/plain": [ "Symbols MAC ROIC SKT SPG TCO\n", "count 272.000000 272.000000 272.000000 272.000000 272.000000\n", "mean 55.118781 18.442282 24.788283 151.898158 55.258763\n", "std 4.208408 0.656658 3.412448 7.387024 6.499111\n", "min 47.703194 16.672773 19.759792 138.740128 42.535595\n", "25% 51.428945 17.981809 21.999384 146.275017 50.471438\n", "50% 55.492500 18.473031 23.172514 150.871696 54.953302\n", "75% 59.109798 18.914228 28.274007 157.328503 60.127033\n", "max 62.869678 20.064341 31.560102 167.898178 67.782234" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Basic Description of the Data\n", "\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Symbols MAC ROIC SKT SPG TCO\n", "Date \n", "2016-12-01 58.302998 18.167982 29.123262 157.796829 63.129143\n", "2016-12-02 58.925732 18.436207 29.420441 160.063568 63.632339\n", "2016-12-05 59.505215 18.534555 29.692146 161.312943 64.100204\n", "2016-12-06 59.721443 18.597143 29.946865 160.732864 64.877075\n", "2016-12-07 61.165833 18.954781 30.481783 163.981232 65.768707\n", "Symbols MAC ROIC SKT SPG TCO\n", "Date \n", "2017-12-22 59.747002 18.089575 22.627144 155.758743 59.732700\n", "2017-12-26 60.232689 18.398554 23.054577 157.893692 60.799690\n", "2017-12-27 60.370136 18.520279 23.428577 159.208237 60.966686\n", "2017-12-28 60.489265 18.735632 23.597769 159.562500 60.911030\n", "2017-12-29 60.186863 18.679453 23.606674 160.112564 60.706913\n" ] } ], "source": [ "first = df.head()\n", "last = df.tail()\n", "print(first)\n", "print(last)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SymbolsMACROICSKTSPGTCO
Date
2017-03-2856.46599618.94901327.943995151.42642257.989056
2017-03-1056.10374518.39606726.769661151.53457658.936821
2017-04-0456.81940119.27666128.149717153.68815659.192852
2017-05-0555.06240119.00361824.801085148.24562154.970566
2017-07-1351.44458017.83666222.929958144.01602254.236202
2017-03-2456.43065619.22205427.866848151.33631958.420258
\n", "
" ], "text/plain": [ "Symbols MAC ROIC SKT SPG TCO\n", "Date \n", "2017-03-28 56.465996 18.949013 27.943995 151.426422 57.989056\n", "2017-03-10 56.103745 18.396067 26.769661 151.534576 58.936821\n", "2017-04-04 56.819401 19.276661 28.149717 153.688156 59.192852\n", "2017-05-05 55.062401 19.003618 24.801085 148.245621 54.970566\n", "2017-07-13 51.444580 17.836662 22.929958 144.016022 54.236202\n", "2017-03-24 56.430656 19.222054 27.866848 151.336319 58.420258" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sample(6)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SymbolsMACROICSKTSPGTCO
Date
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [MAC, ROIC, SKT, SPG, TCO]\n", "Index: []" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A Closer Look At Your Data: Queries\n", "\n", "df.query('MAC == ROIC')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index([], dtype='object', name='Symbols')\n" ] } ], "source": [ "#cleaning\n", "print(df.columns[df.isnull().any()])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SymbolsMACROICSKTSPGTCO
2016-12-01FalseFalseFalseFalseFalse
2016-12-02FalseFalseFalseFalseFalse
2016-12-05FalseFalseFalseFalseFalse
2016-12-06FalseFalseFalseFalseFalse
2016-12-07FalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ "Symbols MAC ROIC SKT SPG TCO\n", "2016-12-01 False False False False False\n", "2016-12-02 False False False False False\n", "2016-12-05 False False False False False\n", "2016-12-06 False False False False False\n", "2016-12-07 False False False False False" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Getting all weekdays between 01/01/2000 and 12/31/2016\n", "all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')\n", "\n", "# How do we align the existing prices in adj_close with our new set of dates?\n", "# All we need to do is reindex adj_close using all_weekdays as the new index\n", "df = df.reindex(all_weekdays)\n", "\n", "# Reindexing will insert missing values (NaN) for the dates that were not present\n", "# in the original set. To cope with this, we can fill the missing by replacing them\n", "# with the latest available price for each instrument.\n", "df = df.fillna(method='ffill')\n", "df.isnull().head()\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(51, 53] 62\n", "(59, 61] 61\n", "(57, 59] 46\n", "(49, 51] 35\n", "(55, 57] 26\n", "(53, 55] 21\n", "(47, 49] 17\n", "Name: MAC_bucket, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Define your own bins\n", "mybins = range(int(df.MAC.min()), int(df.MAC.max()), 2)\n", "\n", "# Cut the data with the help of the bins\n", "df['MAC_bucket'] = pd.cut(df.MAC, bins=mybins)\n", "\n", "# Count the number of values per bucket\n", "df['MAC_bucket'].value_counts()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" }, "nikola": { "category": "", "date": "2018-02-05 21:48:16 UTC-05:00", "description": "", "link": "", "slug": "exploratory-data-analysis-eda-tutorial", "tags": "", "title": "Exploratory Data Analysis (EDA) Tutorial", "type": "text" } }, "nbformat": 4, "nbformat_minor": 2 }