Pandas objects can be thought of as enhanced versions of NumPy structured arrays in which the rows and columns are identified with labels rather than simple integer indices.
Let's introduce these three fundamental Pandas data structures: the `Series`, the `DataFrame`, and the `Index`.
import numpy as np
import pandas as pd
A Pandas `Series` is a one-dimensional array of indexed data:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data
0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
`Series` objects wrap both a sequence of values and a sequence of indices, which we can access with the `values` and `index` attributes.
The `values` are simply a familiar NumPy array:
data.values
array([0.25, 0.5 , 0.75, 1. ])
type(_)
numpy.ndarray
The `index` is an array-like object of type `pd.Index`:
data.index
RangeIndex(start=0, stop=4, step=1)
print(pd.RangeIndex.__doc__)
Immutable Index implementing a monotonic integer range.

RangeIndex is a memory-saving special case of Int64Index limited to
representing monotonic ranges. Using RangeIndex may in some instances
improve computing speed.

This is the default index type used
by DataFrame and Series when no explicit index is provided by the user.

Parameters
----------
start : int (default: 0), or other RangeIndex instance
    If int and "stop" is not given, interpreted as "stop" instead.
stop : int (default: 0)
step : int (default: 1)
dtype : np.int64
    Unused, accepted for homogeneity with other index types.
copy : bool, default False
    Unused, accepted for homogeneity with other index types.
name : object, optional
    Name to be stored in the index.

Attributes
----------
start
stop
step

Methods
-------
from_range

See Also
--------
Index : The base pandas Index type.
Int64Index : Index of int64 data.
As with a NumPy array, data can be accessed by the associated index via the familiar Python square-bracket notation:
data[1]
0.5
data[1:3]
1    0.50
2    0.75
dtype: float64
type(_)
pandas.core.series.Series
Series as a generalized NumPy array

It may look like the `Series` object is basically interchangeable with a one-dimensional NumPy array.
The essential difference is the presence of the index: while the NumPy array has an implicitly defined integer index used to access the values, the Pandas `Series` has an explicitly defined index associated with the values.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
data['b'] # item access works as expected
0.5
We can even use non-contiguous or non-sequential indices:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data
2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64
data[5]
0.5
You can think of a Pandas `Series` a bit like a specialization of a Python dictionary.
A dictionary is a structure that maps arbitrary keys to a set of arbitrary values, and a `Series` is a structure that maps typed keys to a set of typed values:
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
type(_)
pandas.core.series.Series
population.index
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
population['California'] # typical dictionary-style item access
38332521
population['California':'Illinois'] # array-like slicing
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
The next fundamental structure in Pandas is the `DataFrame`, which can be thought of either as a generalization of a NumPy array, or as a specialization of a Python dictionary.
If a `Series` is an analog of a one-dimensional array with flexible indices, a `DataFrame` is an analog of a two-dimensional array with both flexible row indices and flexible column names.
You can think of a `DataFrame` as a sequence of aligned `Series` objects.
Here, by aligned we mean that they share the same index:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64
states = pd.DataFrame({'population': population, 'area': area})
states
population | area | |
---|---|---|
California | 38332521 | 423967 |
Texas | 26448193 | 695662 |
New York | 19651127 | 141297 |
Florida | 19552860 | 170312 |
Illinois | 12882135 | 149995 |
type(_)
pandas.core.frame.DataFrame
states.index
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Additionally, the `DataFrame` has a `columns` attribute, which is an `Index` object holding the column labels:
states.columns
Index(['population', 'area'], dtype='object')
type(_)
pandas.core.indexes.base.Index
Thus the `DataFrame` can be thought of as a generalization of a two-dimensional NumPy array, where both the rows and columns have a generalized index for accessing the data.
Similarly, we can also think of a `DataFrame` as a specialization of a dictionary.
Where a dictionary maps a key to a value, a `DataFrame` maps a column name to a `Series` of column data:
states['area'] # dict-style access returns the 'area' column as a Series
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
A `DataFrame` is a collection of `Series` objects, and a single-column `DataFrame` can be constructed from a single `Series`:
pd.DataFrame(population, columns=['population'])
population | |
---|---|
California | 38332521 |
Texas | 26448193 |
New York | 19651127 |
Florida | 19552860 |
Illinois | 12882135 |
data = [{'a': i, 'b': 2 * i} for i in range(3)]
data
[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]
pd.DataFrame(data)
a | b | |
---|---|---|
0 | 0 | 0 |
1 | 1 | 2 |
2 | 2 | 4 |
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]) # Pandas will fill missing keys with ``NaN``
a | b | c | |
---|---|---|---|
0 | 1.0 | 2 | NaN |
1 | NaN | 3 | 4.0 |
Given a two-dimensional array of data, we can create a `DataFrame` with any specified column and index names:
np.random.rand(3, 2)
array([[0.30282887, 0.48376433],
       [0.53588853, 0.97428136],
       [0.94756199, 0.46766408]])
pd.DataFrame(np.random.rand(3, 2),
columns=['foo', 'bar'],
index=['a', 'b', 'c'])
foo | bar | |
---|---|---|
a | 0.759907 | 0.458958 |
b | 0.776779 | 0.767430 |
c | 0.131552 | 0.740137 |
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A
array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])
pd.DataFrame(A)
A | B | |
---|---|---|
0 | 0 | 0.0 |
1 | 0 | 0.0 |
2 | 0 | 0.0 |
This `Index` object is an interesting structure in itself, and it can be thought of either as an immutable array or as an ordered set (technically a multi-set, as `Index` objects may contain repeated values).
ind = pd.Index([2, 3, 5, 7, 11])
ind
Int64Index([2, 3, 5, 7, 11], dtype='int64')
The `Index` in many ways operates like an array:
ind[1]
3
ind[::2]
Int64Index([2, 5, 11], dtype='int64')
`Index` objects also have many of the attributes familiar from NumPy arrays:
ind.size, ind.shape, ind.ndim, ind.dtype
(5, (5,), 1, dtype('int64'))
One difference is that indices are immutable–that is, they cannot be modified via the normal means:
ind[1] = 0
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-60-906a9fa1424c> in <module>
----> 1 ind[1] = 0

~/Developer/py-venvs/sphinx-venv/lib/python3.9/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
   4275     @final
   4276     def __setitem__(self, key, value):
-> 4277         raise TypeError("Index does not support mutable operations")
   4278
   4279     def __getitem__(self, key):

TypeError: Index does not support mutable operations
The `Index` object follows many of the conventions used by Python's built-in `set` data structure, so that unions, intersections, differences, and other combinations can be computed in a familiar way:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indA.intersection(indB) # intersection
Int64Index([3, 5, 7], dtype='int64')
indA.union(indB) # union
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
indA.symmetric_difference(indB) # symmetric difference
Int64Index([1, 2, 9, 11], dtype='int64')
To access and modify values in NumPy arrays we use indexing (e.g., `arr[2, 1]`), slicing (e.g., `arr[:, 1:5]`), masking (e.g., `arr[arr > 0]`), fancy indexing (e.g., `arr[0, [1, 5]]`), and combinations thereof (e.g., `arr[:, [1, 5]]`).
Here we'll look at similar means of accessing and modifying values in Pandas `Series` and `DataFrame` objects.
If you have used the NumPy patterns, the corresponding patterns in Pandas will feel very familiar, though there are a few quirks to be aware of.
A `Series` object acts in many ways like a one-dimensional NumPy array, and in many ways like a standard Python dictionary.
Like a dictionary, the `Series` object provides a mapping from a collection of keys to a collection of values:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
data['b'] # dictionary-style access by key
0.5
'a' in data # dictionary-like Python expressions...
True
data.keys() # ...and methods.
Index(['a', 'b', 'c', 'd'], dtype='object')
list(data.items())
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]
`Series` objects can even be modified with a dictionary-like syntax:
data['e'] = 1.25
data
a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64
This easy mutability of the objects is a convenient feature: under the hood, Pandas is making decisions about memory layout and data copying that might need to take place.
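As a small illustration of this point, if a newly assigned value cannot be represented by the current dtype, Pandas transparently reallocates and upcasts the underlying array (recent Pandas releases may warn about such implicit upcasts):
s = pd.Series([1, 2, 3])
s.dtype # dtype('int64')
s[3] = 0.5 # enlarging the Series with a float value...
s.dtype # dtype('float64'): pandas reallocated and upcast behind the scenes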
A `Series` builds on this dictionary-like interface and provides array-style item selection via the same basic mechanisms as NumPy arrays – that is, slices, masking, and fancy indexing:
data['a':'c'] # slicing by explicit index
a    0.25
b    0.50
c    0.75
dtype: float64
data[0:2] # slicing by implicit integer index
a    0.25
b    0.50
dtype: float64
data[(data > 0.3) & (data < 0.8)] # masking
b    0.50
c    0.75
dtype: float64
This masking works because the expression itself evaluates to a Boolean `Series`:
(data > 0.3) & (data < 0.8)
a    False
b     True
c     True
d    False
e    False
dtype: bool
type(_)
pandas.core.series.Series
data[['a', 'e']] # fancy indexing
a    0.25
e    1.25
dtype: float64
Notice that when slicing with an explicit index (i.e., `data['a':'c']`), the final index is included in the slice, while when slicing with an implicit index (i.e., `data[0:2]`), the final index is excluded from the slice.
If your `Series` has an explicit integer index, an indexing operation such as `data[1]` will use the explicit indices, while a slicing operation like `data[1:3]` will use the implicit Python-style index.
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data
1    a
3    b
5    c
dtype: object
data[1] # explicit index when indexing
'a'
data[1:3] # implicit index when slicing
3    b
5    c
dtype: object
Because of this potential confusion in the case of integer indexes, Pandas provides some special indexer attributes that explicitly expose certain indexing schemes.
These are not functional methods, but attributes that expose a particular slicing interface to the data in the `Series`.
First, the `loc` attribute allows indexing and slicing that always references the explicit index:
data.loc[1]
'a'
data.loc[1:3]
1    a
3    b
dtype: object
The `iloc` attribute allows indexing and slicing that always references the implicit Python-style index:
data.iloc[1:3]
3    b
5    c
dtype: object
A third indexing attribute, `ix`, is a hybrid of the two, and for `Series` objects is equivalent to standard `[]`-based indexing.
The purpose of the `ix` indexer will become more apparent in the context of `DataFrame` objects.
(Note that `ix` was deprecated in Pandas 0.20 and removed in Pandas 1.0; modern code should use `loc` and `iloc` instead.)
Recall that a `DataFrame` acts in many ways like a two-dimensional or structured array, and in other ways like a dictionary of `Series` structures sharing the same index.
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data
area | pop | |
---|---|---|
California | 423967 | 38332521 |
Texas | 695662 | 26448193 |
New York | 141297 | 19651127 |
Florida | 170312 | 19552860 |
Illinois | 149995 | 12882135 |
data['area'] # columns can be accessed via dict-style indexing
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
data.area # alternatively, use attribute-style access with column names
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
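One caveat worth a quick illustration: attribute-style access only works when the column name is a string that is a valid Python identifier and does not collide with an existing `DataFrame` method. Our 'pop' column, for example, is shadowed by the built-in `pop()` method:
data.pop # <bound method DataFrame.pop ...> -- the method, not the 'pop' column
data['pop'] # dictionary-style access always retrieves the column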
This dictionary-style syntax can also be used to modify the object, in this case adding a new column:
data['density'] = data['pop'] / data['area']
data
area | pop | density | |
---|---|---|---|
California | 423967 | 38332521 | 90.413926 |
Texas | 695662 | 26448193 | 38.018740 |
New York | 141297 | 19651127 | 139.076746 |
Florida | 170312 | 19552860 | 114.806121 |
Illinois | 149995 | 12882135 | 85.883763 |
The `DataFrame` can also be viewed as an enhanced two-dimensional array:
data.values # examine the raw underlying data array
array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])
data.values.T
array([[4.23967000e+05, 6.95662000e+05, 1.41297000e+05, 1.70312000e+05,
        1.49995000e+05],
       [3.83325210e+07, 2.64481930e+07, 1.96511270e+07, 1.95528600e+07,
        1.28821350e+07],
       [9.04139261e+01, 3.80187404e+01, 1.39076746e+02, 1.14806121e+02,
        8.58837628e+01]])
type(_)
numpy.ndarray
data.T # transpose the full DataFrame object
California | Texas | New York | Florida | Illinois | |
---|---|---|---|---|---|
area | 4.239670e+05 | 6.956620e+05 | 1.412970e+05 | 1.703120e+05 | 1.499950e+05 |
pop | 3.833252e+07 | 2.644819e+07 | 1.965113e+07 | 1.955286e+07 | 1.288214e+07 |
density | 9.041393e+01 | 3.801874e+01 | 1.390767e+02 | 1.148061e+02 | 8.588376e+01 |
type(_)
pandas.core.frame.DataFrame
data.values[0] # passing a single index to an array accesses a row
array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])
data['area'] # passing a single "index" accesses a column
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
Using the `iloc` indexer, we can index the underlying array as if it were a simple NumPy array (using the implicit Python-style index):
data.iloc[:3, :2]
area | pop | |
---|---|---|
California | 423967 | 38332521 |
Texas | 695662 | 26448193 |
New York | 141297 | 19651127 |
Similarly, using the `loc` indexer, we can index the underlying data in an array-like style but using the explicit index and column names:
data.loc[:'Illinois', :'pop']
area | pop | |
---|---|---|
California | 423967 | 38332521 |
Texas | 695662 | 26448193 |
New York | 141297 | 19651127 |
Florida | 170312 | 19552860 |
Illinois | 149995 | 12882135 |
Any of the familiar NumPy-style data access patterns can be used within these indexers.
data.loc[data.density > 100, ['pop', 'density']]
pop | density | |
---|---|---|
New York | 19651127 | 139.076746 |
Florida | 19552860 | 114.806121 |
Any of these indexing conventions may also be used to set or modify values; this is done in the standard way that you might be accustomed to from working with NumPy:
data.iloc[0, 2] = 90
data
area | pop | density | |
---|---|---|---|
California | 423967 | 38332521 | 90.000000 |
Texas | 695662 | 26448193 | 38.018740 |
New York | 141297 | 19651127 | 139.076746 |
Florida | 170312 | 19552860 | 114.806121 |
Illinois | 149995 | 12882135 | 85.883763 |
data['Florida':'Illinois'] # *slicing* refers to rows
area | pop | density | |
---|---|---|---|
Florida | 170312 | 19552860 | 114.806121 |
Illinois | 149995 | 12882135 | 85.883763 |
data[data.density > 100] # direct masking operations are also interpreted row-wise
area | pop | density | |
---|---|---|---|
New York | 141297 | 19651127 | 139.076746 |
Florida | 170312 | 19552860 | 114.806121 |
One of the essential pieces of NumPy is the ability to perform quick element-wise operations, both with basic arithmetic (addition, subtraction, multiplication, etc.) and with more sophisticated operations (trigonometric functions, exponential and logarithmic functions, etc.).
Pandas inherits much of this functionality from NumPy.
Pandas includes a couple of useful twists, however: for unary operations like negation and trigonometric functions, these ufuncs will preserve index and column labels in the output, and for binary operations such as addition and multiplication, Pandas will automatically align indices when passing the objects to the ufunc.
Because Pandas is designed to work with NumPy, any NumPy ufunc will work on Pandas `Series` and `DataFrame` objects:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser
0    6
1    3
2    7
3    4
dtype: int64
rng.randint(0, 10, (3, 4))
array([[1, 7, 5, 1],
       [4, 0, 9, 5],
       [8, 0, 9, 2]])
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])
df
A | B | C | D | |
---|---|---|---|---|
0 | 6 | 3 | 8 | 2 |
1 | 4 | 2 | 6 | 4 |
2 | 8 | 6 | 1 | 3 |
If we apply a NumPy ufunc on either of these objects, the result will be another Pandas object with the indices preserved:
np.exp(ser)
0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64
type(_)
pandas.core.series.Series
np.sin(df * np.pi / 4) # a slightly more complex calculation
A | B | C | D | |
---|---|---|---|---|
0 | -1.000000e+00 | 0.707107 | -2.449294e-16 | 1.000000e+00 |
1 | 1.224647e-16 | 1.000000 | -1.000000e+00 | 1.224647e-16 |
2 | -2.449294e-16 | -1.000000 | 7.071068e-01 | 7.071068e-01 |
type(_)
pandas.core.frame.DataFrame
For binary operations on two `Series` or `DataFrame` objects, Pandas will align indices in the process of performing the operation.
Suppose we are combining two different data sources, and find only the top three US states by area and the top three US states by population:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127}, name='population')
population / area
Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64
The resulting array contains the union of indices of the two input arrays, which could be determined using standard Python set arithmetic on these indices:
area.index.union(population.index) # returns a new Index; neither input is modified
Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')
area.index
Index(['Alaska', 'Texas', 'California'], dtype='object')
Any item for which one or the other does not have an entry is marked with `NaN`, or "Not a Number," which is how Pandas marks missing data.
Index matching like this is implemented for any of Python's built-in arithmetic expressions; any missing values are filled in with NaN by default:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B
0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64
If using NaN values is not the desired behavior, the fill value can be modified using appropriate object methods in place of the operators:
A.add(B, fill_value=0)
0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64
A similar type of alignment takes place for both columns and indices when performing operations on `DataFrame`s:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
A
A | B | |
---|---|---|
0 | 13 | 17 |
1 | 8 | 1 |
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
B
B | A | C | |
---|---|---|---|
0 | 1 | 5 | 5 |
1 | 9 | 3 | 5 |
2 | 1 | 9 | 1 |
A + B
A | B | C | |
---|---|---|---|
0 | 18.0 | 18.0 | NaN |
1 | 11.0 | 10.0 | NaN |
2 | NaN | NaN | NaN |
fill = A.stack().mean() # the mean of all values in A
fill
9.75
A.add(B, fill_value=fill)
A | B | C | |
---|---|---|---|
0 | 18.00 | 18.00 | 14.75 |
1 | 11.00 | 10.00 | 14.75 |
2 | 18.75 | 10.75 | 10.75 |
The following table lists Python operators and their equivalent Pandas object methods:
Python Operator | Pandas Method(s) |
---|---|
+ |
add() |
- |
sub() , subtract() |
* |
mul() , multiply() |
/ |
truediv() , div() , divide() |
// |
floordiv() |
% |
mod() |
** |
pow() |
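As a brief sketch of this correspondence, using two small example Series `a` and `b` (hypothetical data), each method mirrors its operator while additionally accepting a `fill_value`:
a = pd.Series([10, 20, 30], index=[0, 1, 2])
b = pd.Series([1, 2, 3], index=[1, 2, 3])
a.sub(b).equals(a - b) # True: the method form mirrors the - operator
a.sub(b, fill_value=0) # missing entries are treated as 0 before subtracting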
When performing operations between a `DataFrame` and a `Series`, the index and column alignment is similarly maintained.
Operations between a `DataFrame` and a `Series` are similar to operations between a two-dimensional and a one-dimensional NumPy array.
A = rng.randint(10, size=(3, 4))
A
array([[3, 8, 2, 4],
       [2, 6, 4, 8],
       [6, 1, 3, 8]])
type(A)
numpy.ndarray
A - A[0]
array([[ 0,  0,  0,  0],
       [-1, -2,  2,  4],
       [ 3, -7,  1,  4]])
According to NumPy's broadcasting rules, subtraction between a two-dimensional array and one of its rows is applied row-wise.
In Pandas, the convention similarly operates row-wise by default:
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]
Q | R | S | T | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 0 |
1 | -1 | -2 | 2 | 4 |
2 | 3 | -7 | 1 | 4 |
If you would instead like to operate column-wise, you have to specify the `axis` keyword:
df.subtract(df['R'], axis=0)
Q | R | S | T | |
---|---|---|---|---|
0 | -5 | 0 | -6 | -4 |
1 | -4 | 0 | -2 | 2 |
2 | 5 | 0 | 2 | 7 |
The difference between data found in many tutorials and data in the real world is that real-world data is rarely clean and homogeneous. In particular, many interesting datasets will have some amount of data missing.
To make matters even more complicated, different data sources may indicate missing data in different ways.
To indicate the presence of missing data in a table or DataFrame we can use two strategies: using a mask that globally indicates missing values, or choosing a sentinel value that indicates a missing entry.
In the masking approach, the mask might be an entirely separate Boolean array, or it may involve appropriation of one bit in the data representation to locally indicate the null status of a value.
In the sentinel approach, the sentinel value could be some data-specific convention, such as indicating a missing integer value with -9999 or some rare bit pattern, or it could be a more global convention, such as indicating a missing floating-point value with NaN (Not a Number).
None of these approaches is without trade-offs: use of a separate mask array requires allocation of an additional Boolean array. A sentinel value reduces the range of valid values that can be represented, and may require extra (often non-optimized) logic in CPU and GPU arithmetic.
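As a small sketch of the sentinel approach (the -9999 convention here is hypothetical), a typical first cleaning step is to convert the sentinel into a proper NA value:
raw = pd.Series([12, -9999, 7, -9999, 3]) # -9999 marks missing entries
raw.replace(-9999, np.nan) # the sentinel becomes NaN; dtype is upcast to float64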
The way in which Pandas handles missing values is constrained by its reliance on the NumPy package, which does not have a built-in notion of NA values for non-floating-point data types.
NumPy does have support for masked arrays – that is, arrays that have a separate Boolean mask array attached for marking data as "good" or "bad." Pandas could have derived from this, but the overhead in both storage, computation, and code maintenance makes that an unattractive choice.
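For reference, a minimal sketch of the masked-array approach that NumPy supports and Pandas chose not to build on:
m = np.ma.masked_array([1, 2, 3, 4], mask=[False, True, False, False])
m.sum() # 8: the masked entry is skipped by aggregations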
With these constraints in mind, Pandas chose to use sentinels for missing data, and further chose to use two already-existing Python null values: the special floating-point `NaN` value, and the Python `None` object.
None: Pythonic missing data

The first sentinel value used by Pandas is `None`, a Python singleton object that is often used for missing data in Python code.
Because it is a Python object, `None` cannot be used in any arbitrary NumPy/Pandas array, but only in arrays with data type 'object' (i.e., arrays of Python objects):
vals1 = np.array([1, None, 3, 4])
vals1
array([1, None, 3, 4], dtype=object)
Any operations on the data will be done at the Python level, with much more overhead than the typically fast operations seen for arrays with native types:
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()
dtype = object
81.8 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
1.87 ms ± 34.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
The use of Python objects in an array also means that if you perform aggregations like `sum()` or `min()` across an array with a `None` value, you will generally get an error:
vals1.sum()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-94-30a3fc8c6726> in <module>
----> 1 vals1.sum()

~/Developer/venvs/py-ml/lib/python3.8/site-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     45 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     46          initial=_NoValue, where=True):
---> 47     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     48
     49 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
NaN: Missing numerical data

The other missing data representation, `NaN` (acronym for Not a Number), is different; it is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype
dtype('float64')
1 + np.nan, 0 * np.nan
(nan, nan)
vals2.sum(), vals2.min(), vals2.max()
(nan, nan, nan)
NumPy does provide some special aggregations that will ignore these missing values:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)
(8.0, 1.0, 4.0)
`NaN` and `None` both have their place, and Pandas is built to handle the two of them nearly interchangeably, converting between them where appropriate:
pd.Series([1, np.nan, 2, None])
0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64
The following table lists the upcasting conventions in Pandas when NA values are introduced:
Typeclass | Conversion When Storing NAs | NA Sentinel Value |
---|---|---|
floating | No change | np.nan |
object | No change | None or np.nan |
integer | Cast to float64 | np.nan |
boolean | Cast to object | None or np.nan |
Keep in mind that in Pandas, string data is always stored with an `object` dtype.
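A quick sketch of the integer upcast in action (classic Pandas performs it silently; recent releases warn that such implicit upcasts may eventually become errors):
x = pd.Series(range(2), dtype=int)
x.dtype # dtype('int64')
x[0] = None # introduce an NA value...
x.dtype # dtype('float64'): the Series was upcast, and None was stored as NaN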
As we have seen, Pandas treats `None` and `NaN` as essentially interchangeable for indicating missing or null values.
To facilitate this convention, there are several useful methods for detecting, removing, and replacing null values in Pandas data structures.
They are:

- `isnull()`: Generate a Boolean mask indicating missing values
- `notnull()`: Opposite of `isnull()`
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed

Pandas data structures have two useful methods for detecting null data: `isnull()` and `notnull()`.
Either one will return a Boolean mask over the data:
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()
0    False
1     True
2    False
3     True
dtype: bool
In addition to the masking used before, there are the convenience methods `dropna()` (which removes NA values) and `fillna()` (which fills in NA values):
data.dropna()
0        1
2    hello
dtype: object
For a `DataFrame`, there are more options:
df = pd.DataFrame([[1, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6]])
df
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | NaN | 2 |
1 | 2.0 | 3.0 | 5 |
2 | NaN | 4.0 | 6 |
df.dropna() # drop all rows in which *any* null value is present
0 | 1 | 2 | |
---|---|---|---|
1 | 2.0 | 3.0 | 5 |
df.dropna(axis='columns') # drop all columns containing a null value
2 | |
---|---|
0 | 2 |
1 | 5 |
2 | 6 |
The default is `how='any'`, such that any row or column (depending on the `axis` keyword) containing a null value will be dropped.
df[3] = np.nan
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 1.0 | NaN | 2 | NaN |
1 | 2.0 | 3.0 | 5 | NaN |
2 | NaN | 4.0 | 6 | NaN |
You can also specify `how='all'`, which will only drop rows/columns that are all null values:
df.dropna(axis='columns', how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | NaN | 2 |
1 | 2.0 | 3.0 | 5 |
2 | NaN | 4.0 | 6 |
The `thresh` parameter lets you specify a minimum number of non-null values for the row/column to be kept:
df.dropna(axis='rows', thresh=3)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
1 | 2.0 | 3.0 | 5 | NaN |
Sometimes rather than dropping NA values, you'd rather replace them with a valid value.
This value might be a single number like zero, or it might be some sort of imputation or interpolation from the good values.
You could do this in-place using the `isnull()` method as a mask, but because it is such a common operation Pandas provides the `fillna()` method, which returns a copy of the array with the null values replaced.
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data
a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64
data.fillna(0) # fill NA entries with a single value
a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64
data.fillna(method='ffill') # specify a forward-fill to propagate the previous value forward
a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64
data.fillna(method='bfill') # specify a back-fill to propagate the next values backward
a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64
For `DataFrame`s, the options are similar, but we can also specify an `axis` along which the fills take place:
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 1.0 | NaN | 2 | NaN |
1 | 2.0 | 3.0 | 5 | NaN |
2 | NaN | 4.0 | 6 | NaN |
df.fillna(method='ffill', axis=1)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 1.0 | 1.0 | 2.0 | 2.0 |
1 | 2.0 | 3.0 | 5.0 | 5.0 |
2 | NaN | 4.0 | 6.0 | 6.0 |
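As an aside for newer Pandas versions: the `method` argument to `fillna` has since been deprecated, and the dedicated fill methods are the modern spelling (a sketch, assuming Pandas 2.x):
df.ffill(axis=1) # equivalent to fillna(method='ffill', axis=1)
df.bfill(axis=1) # equivalent to fillna(method='bfill', axis=1)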
Up to this point we've been focused primarily on one-dimensional and two-dimensional data, stored in Pandas `Series` and `DataFrame` objects, respectively.
Often it is useful to go beyond this and store higher-dimensional data – that is, data indexed by more than one or two keys.
Rather than introducing a dedicated higher-dimensional structure, a far more common pattern in practice is to make use of hierarchical indexing (also known as multi-indexing) to incorporate multiple index levels within a single index.
In this way, higher-dimensional data can be compactly represented within the familiar one-dimensional `Series` and two-dimensional `DataFrame` objects.
Let's start by considering how we might represent two-dimensional data within a one-dimensional `Series`.
Suppose you would like to track data about states from two different years. Using the Pandas tools we've already covered, you might be tempted to simply use Python tuples as keys:
index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]
pop = pd.Series(populations, index=index)
pop
(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64
If you need to select all values from 2010, you'll need to do some messy (and potentially slow) munging to make it happen:
pop[[i for i in pop.index if i[1] == 2010]]
(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64
Our tuple-based indexing is essentially a rudimentary multi-index, and the Pandas `MultiIndex` type gives us the type of operations we wish to have:
index = pd.MultiIndex.from_tuples(index)
index
MultiIndex([('California', 2000),
            ('California', 2010),
            ( 'New York', 2000),
            ( 'New York', 2010),
            (    'Texas', 2000),
            (    'Texas', 2010)],
           )
type(_)
pandas.core.indexes.multi.MultiIndex
A `MultiIndex` contains multiple levels of indexing – in this case, the state names and the years – as well as multiple labels for each data point which encode these levels.
If we re-index our series with this `MultiIndex`, we see the hierarchical representation of the data:
pop = pop.reindex(index)
pop
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
Here the first two columns of the `Series` representation show the multiple index values, while the third column shows the data.
Notice that some entries are missing in the first column: in this multi-index representation, any blank entry indicates the same value as the line above it.
Now to access all data for which the second index is 2010, we can simply use the Pandas slicing notation:
pop[:, 2010]
California    37253956
New York      19378102
Texas         25145561
dtype: int64
The result is a singly indexed array with just the keys we're interested in. This syntax is much more convenient (and the operation is much more efficient!) than the home-spun tuple-based multi-indexing solution that we started with.
We could have stored the same data using a simple `DataFrame` with index and column labels; in fact, Pandas is built with this equivalence in mind.
The `unstack()` method will quickly convert a multiply indexed `Series` into a conventionally indexed `DataFrame`:
pop_df = pop.unstack()
pop_df
2000 | 2010 | |
---|---|---|
California | 33871648 | 37253956 |
New York | 18976457 | 19378102 |
Texas | 20851820 | 25145561 |
type(pop_df)
pandas.core.frame.DataFrame
Naturally, the `stack()` method provides the opposite operation:
pop_df.stack()
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
Seeing this, you might wonder why we would bother with hierarchical indexing at all.
The reason is simple: just as we were able to use multi-indexing to represent two-dimensional data within a one-dimensional `Series`, we can also use it to represent data of three or more dimensions in a `Series` or `DataFrame`.
Each extra level in a multi-index represents an extra dimension of data; taking advantage of this property gives us much more flexibility in the types of data we can represent.
Concretely, we might want to add another column of demographic data for each state at each year (say, population under 18); with a `MultiIndex` this is as easy as adding another column to the `DataFrame`:
pop_df = pd.DataFrame({'total': pop,
'under18': [9267089, 9284094,
4687374, 4318033,
5906301, 6879014]})
pop_df
total | under18 | ||
---|---|---|---|
California | 2000 | 33871648 | 9267089 |
2010 | 37253956 | 9284094 | |
New York | 2000 | 18976457 | 4687374 |
2010 | 19378102 | 4318033 | |
Texas | 2000 | 20851820 | 5906301 |
2010 | 25145561 | 6879014 |
All the ufuncs discussed earlier work with hierarchical indices as well; here we compute the fraction of the population under 18 by year:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()
2000 | 2010 | |
---|---|---|
California | 0.273594 | 0.249211 |
New York | 0.247010 | 0.222831 |
Texas | 0.283251 | 0.273568 |
The most straightforward way to construct a multiply indexed `Series` or `DataFrame` is to simply pass a list of two or more index arrays to the constructor:
df = pd.DataFrame(np.random.rand(4, 2),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=['data1', 'data2'])
df
data1 | data2 | ||
---|---|---|---|
a | 1 | 0.482545 | 0.352967 |
2 | 0.574280 | 0.063582 | |
b | 1 | 0.102271 | 0.569372 |
2 | 0.753026 | 0.194597 |
Similarly, if you pass a dictionary with appropriate tuples as keys, Pandas will automatically recognize this and use a `MultiIndex` by default:
data = {('California', 2000): 33871648,
('California', 2010): 37253956,
('Texas', 2000): 20851820,
('Texas', 2010): 25145561,
('New York', 2000): 18976457,
('New York', 2010): 19378102}
pd.Series(data)
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64
For more flexibility in how the index is constructed, you can instead use the class-method constructors available on `pd.MultiIndex`.
For example, you can construct the `MultiIndex` from a simple list of arrays giving the index values within each level:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
You can even construct it from a Cartesian product of single indices:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
Sometimes it is convenient to name the levels of the `MultiIndex`.
This can be accomplished by passing the `names` argument to any of the above `MultiIndex` constructors, or by setting the `names` attribute of the index after the fact:
pop.index.names = ['state', 'year']
pop
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
In a `DataFrame`, the rows and columns are completely symmetric, and just as the rows can have multiple levels of indices, the columns can have multiple levels as well:
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1) # mock some data
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data # display the multiply indexed data
subject | Bob | Guido | Sue | ||||
---|---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp | |
year | visit | ||||||
2013 | 1 | 48.0 | 38.1 | 19.0 | 38.4 | 52.0 | 38.8 |
2 | 34.0 | 38.0 | 37.0 | 36.9 | 31.0 | 37.6 | |
2014 | 1 | 41.0 | 37.0 | 52.0 | 38.9 | 38.0 | 37.4 |
2 | 47.0 | 36.9 | 46.0 | 36.4 | 42.0 | 36.6 |
This is fundamentally four-dimensional data, where the dimensions are the subject, the measurement type, the year, and the visit number; we can index the top-level column by the person's name and get a full `DataFrame` containing just that person's information:
health_data['Guido']
type | HR | Temp | |
---|---|---|---|
year | visit | ||
2013 | 1 | 19.0 | 38.4 |
2 | 37.0 | 36.9 | |
2014 | 1 | 52.0 | 38.9 |
2 | 46.0 | 36.4 |
Consider again the multiply indexed `Series` of state populations:
pop
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
pop['California', 2000] # access single elements by indexing with multiple terms
33871648
The `MultiIndex` also supports partial indexing, or indexing just one of the levels in the index.
The result is another `Series`, with the lower-level indices maintained:
pop['California']
year
2000    33871648
2010    37253956
dtype: int64
Other types of indexing and selection work as well; for example, selection based on Boolean masks:
pop[pop > 22000000]
state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64
or on fancy indexing:
pop[['California', 'Texas']]
state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64
A multiply indexed `DataFrame` behaves in a similar manner:
health_data
subject | Bob | Guido | Sue | ||||
---|---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp | |
year | visit | ||||||
2013 | 1 | 48.0 | 38.1 | 19.0 | 38.4 | 52.0 | 38.8 |
2 | 34.0 | 38.0 | 37.0 | 36.9 | 31.0 | 37.6 | |
2014 | 1 | 41.0 | 37.0 | 52.0 | 38.9 | 38.0 | 37.4 |
2 | 47.0 | 36.9 | 46.0 | 36.4 | 42.0 | 36.6 |
Remember that columns are primary in a `DataFrame`, and the syntax used for multiply indexed `Series` applies to the columns.
We can recover Guido's heart rate data with a simple operation:
health_data['Guido', 'HR']
year  visit
2013  1        19.0
      2        37.0
2014  1        52.0
      2        46.0
Name: (Guido, HR), dtype: float64
Also, as with the single-index case, we can use the `loc` and `iloc` indexers (and, in Pandas versions before 1.0, `ix`):
health_data.iloc[:2, :2]
subject | Bob | ||
---|---|---|---|
type | HR | Temp | |
year | visit | ||
2013 | 1 | 48.0 | 38.1 |
2 | 34.0 | 38.0 |
These indexers provide an array-like view of the underlying two-dimensional data, but each individual index in `loc` or `iloc` can be passed a tuple of multiple indices:
health_data.loc[:, ('Bob', 'HR')]
year  visit
2013  1        48.0
      2        34.0
2014  1        41.0
      2        47.0
Name: (Bob, HR), dtype: float64
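You might like to build a slice within such a tuple, but a slice literal inside a tuple is a syntax error; Pandas provides the `pd.IndexSlice` object for exactly this situation. A short sketch against `health_data`:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']] # visit-1 rows and HR columns for all years and subjects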
One of the keys to working with multiply indexed data is knowing how to effectively transform the data.
There are a number of operations that will preserve all the information in the dataset but rearrange it for the purposes of various computations.
We saw a brief example of this in the `stack()` and `unstack()` methods, but there are many more ways to finely control the rearrangement of data between hierarchical indices and columns.
Earlier, we briefly mentioned a caveat, but we should emphasize it more here: many of the `MultiIndex` slicing operations will fail if the index is not sorted.
We'll start by creating some simple multiply indexed data where the indices are not lexicographically sorted:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data
char  int
a     1      0.002105
      2      0.280923
c     1      0.008604
      2      0.631968
b     1      0.072270
      2      0.273800
dtype: float64
try:
    data['a':'b']  # try to take a partial slice of this index
except KeyError as e:
    print(type(e))
    print(e)
<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'
This is the result of the `MultiIndex` not being sorted; in general, partial slices and other similar operations require the levels in the `MultiIndex` to be in sorted (i.e., lexicographical) order.
Pandas provides a number of convenience routines to perform this type of sorting; examples are the `sort_index()` and `sortlevel()` methods of the `DataFrame` (the latter has since been deprecated in favor of `sort_index()`).
data = data.sort_index()
data
char  int
a     1      0.002105
      2      0.280923
b     1      0.072270
      2      0.273800
c     1      0.008604
      2      0.631968
dtype: float64
With the index sorted in this way, partial slicing will work as expected:
data['a':'b']
char  int
a     1      0.002105
      2      0.280923
b     1      0.072270
      2      0.273800
dtype: float64
As we saw briefly before, it is possible to convert a dataset from a stacked multi-index to a simple two-dimensional representation, optionally specifying the level to use:
pop.unstack(level=0)
state | California | New York | Texas |
---|---|---|---|
year | |||
2000 | 33871648 | 18976457 | 20851820 |
2010 | 37253956 | 19378102 | 25145561 |
pop.unstack(level=1)
year | 2000 | 2010 |
---|---|---|
state | ||
California | 33871648 | 37253956 |
New York | 18976457 | 19378102 |
Texas | 20851820 | 25145561 |
The opposite of `unstack()` is `stack()`, which here can be used to recover the original series:
pop.unstack().stack()
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
Another way to rearrange hierarchical data is to turn the index labels into columns; this can be accomplished with the `reset_index` method.
Calling this on the population `Series` will result in a `DataFrame` with state and year columns holding the information that was formerly in the index.
pop_flat = pop.reset_index(name='population') # specify the name of the data for the column
pop_flat
state | year | population | |
---|---|---|---|
0 | California | 2000 | 33871648 |
1 | California | 2010 | 37253956 |
2 | New York | 2000 | 18976457 |
3 | New York | 2010 | 19378102 |
4 | Texas | 2000 | 20851820 |
5 | Texas | 2010 | 25145561 |
Often when working with data in the real world, the raw input data looks like this, and it's useful to build a `MultiIndex` from the column values.
This can be done with the `set_index` method of the `DataFrame`, which returns a multiply indexed `DataFrame`:
pop_flat.set_index(['state', 'year'])
population | ||
---|---|---|
state | year | |
California | 2000 | 33871648 |
2010 | 37253956 | |
New York | 2000 | 18976457 |
2010 | 19378102 | |
Texas | 2000 | 20851820 |
2010 | 25145561 |
We've previously seen that Pandas has built-in data aggregation methods, such as `mean()`, `sum()`, and `max()`.
For hierarchically indexed data, these can be passed a `level` parameter that controls which subset of the data the aggregate is computed on.
health_data
subject | Bob | Guido | Sue | ||||
---|---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp | |
year | visit | ||||||
2013 | 1 | 48.0 | 38.1 | 19.0 | 38.4 | 52.0 | 38.8 |
2 | 34.0 | 38.0 | 37.0 | 36.9 | 31.0 | 37.6 | |
2014 | 1 | 41.0 | 37.0 | 52.0 | 38.9 | 38.0 | 37.4 |
2 | 47.0 | 36.9 | 46.0 | 36.4 | 42.0 | 36.6 |
Perhaps we'd like to average-out the measurements in the two visits each year. We can do this by naming the index level we'd like to explore, in this case the year:
data_mean = health_data.mean(level='year')
data_mean
subject | Bob | Guido | Sue | |||
---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp |
year | ||||||
2013 | 41.0 | 38.05 | 28.0 | 37.65 | 41.5 | 38.2 |
2014 | 44.0 | 36.95 | 49.0 | 37.65 | 40.0 | 37.0 |
By further making use of the `axis` keyword, we can take the mean among levels on the columns as well:
data_mean.mean(axis=1, level='type')
type | HR | Temp |
---|---|---|
year | ||
2013 | 36.833333 | 37.966667 |
2014 | 44.333333 | 37.200000 |
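A side note for newer Pandas versions: the `level` argument to aggregation methods was later deprecated and removed, so the same results are obtained with an explicit `groupby` (a sketch, assuming a recent Pandas release):
health_data.groupby(level='year').mean() # modern replacement for mean(level='year')
data_mean.T.groupby(level='type').mean().T # group on a column level via a transpose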
Some of the most interesting studies of data come from combining different data sources.
These operations can involve anything from very straightforward concatenation of two different datasets, to more complicated database-style joins and merges that correctly handle any overlaps between the datasets.
`Series` and `DataFrame`s are built with this type of operation in mind, and Pandas includes functions and methods that make this sort of data wrangling fast and straightforward.
Here we'll take a look at simple concatenation of `Series` and `DataFrame`s with the `pd.concat` function; later we'll dive into more sophisticated in-memory merges and joins implemented in Pandas.
For convenience, we'll define this function, which creates a `DataFrame` of a particular form that will be useful below:
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)
# example DataFrame
make_df('ABC', range(3))
A | B | C | |
---|---|---|---|
0 | A0 | B0 | C0 |
1 | A1 | B1 | C1 |
2 | A2 | B2 | C2 |
In addition, we'll create a quick class that allows us to display multiple `DataFrame`s side by side. The code makes use of the special `_repr_html_` method, which IPython uses to implement its rich object display:
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)

    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)
Simple concatenation with pd.concat

Pandas has a function, `pd.concat()`, which has a similar syntax to `np.concatenate` but contains a number of options that we'll discuss momentarily:
# Signature in Pandas v0.18
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
keys=None, levels=None, names=None, verify_integrity=False,
copy=True)
`pd.concat()` can be used for a simple concatenation of `Series` or `DataFrame` objects, just as `np.concatenate()` can be used for simple concatenations of arrays:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')
df1
A | B | |
---|---|---|
1 | A1 | B1 |
2 | A2 | B2 |
df2
A | B | |
---|---|---|
3 | A3 | B3 |
4 | A4 | B4 |
pd.concat([df1, df2])
A | B | |
---|---|---|
1 | A1 | B1 |
2 | A2 | B2 |
3 | A3 | B3 |
4 | A4 | B4 |
By default, the concatenation takes place row-wise within the `DataFrame` (i.e., `axis=0`).
Like `np.concatenate`, `pd.concat` allows specification of an axis along which concatenation will take place:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
display('df3', 'df4', "pd.concat([df3, df4], axis=1)")
df3
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
df4
C | D | |
---|---|---|
0 | C0 | D0 |
1 | C1 | D1 |
pd.concat([df3, df4], axis=1)
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
One important difference between `np.concatenate` and `pd.concat` is that Pandas concatenation preserves indices, even if the result will have duplicate indices:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index # make duplicate indices!
display('x', 'y', 'pd.concat([x, y])')
x
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
y
A | B | |
---|---|---|
0 | A2 | B2 |
1 | A3 | B3 |
pd.concat([x, y])
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
0 | A2 | B2 |
1 | A3 | B3 |
Notice the repeated indices in the result.
While this is valid within `DataFrame`s, the outcome is often undesirable, and `pd.concat()` gives us a few ways to handle it.
The first is the `verify_integrity` flag: with this set to True, the concatenation will raise an exception if the result contains duplicate indices:
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)
ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')
Sometimes the index itself does not matter, and you would prefer it to simply be ignored.
This option can be specified using the `ignore_index` flag.
With this set to True, the concatenation will create a new integer index for the resulting `DataFrame`:
display('x', 'y', 'pd.concat([x, y], ignore_index=True)')
x
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
y
A | B | |
---|---|---|
0 | A2 | B2 |
1 | A3 | B3 |
pd.concat([x, y], ignore_index=True)
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
2 | A2 | B2 |
3 | A3 | B3 |
Another option is to use the `keys` option to specify a label for the data sources; the result will be a hierarchically indexed `DataFrame` containing the data:
display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")
x
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
y
A | B | |
---|---|---|
0 | A2 | B2 |
1 | A3 | B3 |
pd.concat([x, y], keys=['x', 'y'])
A | B | ||
---|---|---|---|
x | 0 | A0 | B0 |
1 | A1 | B1 | |
y | 0 | A2 | B2 |
1 | A3 | B3 |
In practice, data from different sources might have different sets of column names, and `pd.concat` offers several options in this case.
Consider the concatenation of the following two `DataFrame`s, which have some (but not all!) columns in common:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
display('df5', 'df6', 'pd.concat([df5, df6])')
df5
A | B | C | |
---|---|---|---|
1 | A1 | B1 | C1 |
2 | A2 | B2 | C2 |
df6
B | C | D | |
---|---|---|---|
3 | B3 | C3 | D3 |
4 | B4 | C4 | D4 |
pd.concat([df5, df6])
A | B | C | D | |
---|---|---|---|---|
1 | A1 | B1 | C1 | NaN |
2 | A2 | B2 | C2 | NaN |
3 | NaN | B3 | C3 | D3 |
4 | NaN | B4 | C4 | D4 |
By default, the join is a union of the input columns (`join='outer'`), but we can change this to an intersection of the columns using `join='inner'`.
(Another option, the `join_axes` argument for directly specifying the index of the remaining columns as a list of index objects, was removed in Pandas 1.0; reindexing the result achieves the same effect in modern versions.)
Here is the inner join:
display('df5', 'df6', "pd.concat([df5, df6], join='inner')")
df5
A | B | C | |
---|---|---|---|
1 | A1 | B1 | C1 |
2 | A2 | B2 | C2 |
df6
B | C | D | |
---|---|---|---|
3 | B3 | C3 | D3 |
4 | B4 | C4 | D4 |
pd.concat([df5, df6], join='inner')
B | C | |
---|---|---|
1 | B1 | C1 |
2 | B2 | C2 |
3 | B3 | C3 |
4 | B4 | C4 |
The append() method

Because direct array concatenation is so common, `Series` and `DataFrame` objects have an `append` method that can accomplish the same thing in fewer keystrokes.
For example, rather than calling `pd.concat([df1, df2])`, you can simply call `df1.append(df2)`:
display('df1', 'df2', 'df1.append(df2)')
df1
A | B | |
---|---|---|
1 | A1 | B1 |
2 | A2 | B2 |
df2
A | B | |
---|---|---|
3 | A3 | B3 |
4 | A4 | B4 |
df1.append(df2)
A | B | |
---|---|---|
1 | A1 | B1 |
2 | A2 | B2 |
3 | A3 | B3 |
4 | A4 | B4 |
Keep in mind that unlike the `append()` and `extend()` methods of Python lists, the `append()` method in Pandas does not modify the original object – instead it creates a new object with the combined data.
It also is not a very efficient method, because it involves creation of a new index and data buffer.
Thus, if you plan to do multiple `append` operations, it is generally better to build a list of `DataFrame`s and pass them all at once to the `concat()` function.
(For these reasons, `DataFrame.append` was deprecated in Pandas 1.4 and removed in Pandas 2.0; use `pd.concat` in modern code.)
One essential feature offered by Pandas is its high-performance, in-memory join and merge operations.
If you have ever worked with databases, you should be familiar with this type of data interaction.
The main interface for this is the `pd.merge` function, and we'll see a few examples of how this can work in practice.
For convenience, we will start by redefining the `display()` functionality:
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)

    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)
The behavior implemented in `pd.merge()` is a subset of what is known as relational algebra, which is a formal set of rules for manipulating relational data, and forms the conceptual foundation of operations available in most databases.
The strength of the relational algebra approach is that it proposes several primitive operations, which become the building blocks of more complicated operations on any dataset.
With this lexicon of fundamental operations implemented efficiently in a database or other program, a wide range of fairly complicated composite operations can be performed.
Pandas implements several of these fundamental building blocks in the `pd.merge()` function and the related `join()` method of `Series` and `DataFrame`s.
The `pd.merge()` function implements a number of types of joins: one-to-one, many-to-one, and many-to-many joins.
All three types of joins are accessed via an identical call to the `pd.merge()` interface; the type of join performed depends on the form of the input data.
Perhaps the simplest type of merge expression is the one-to-one join, which is in many ways very similar to the column-wise concatenation that we have already seen.
As a concrete example, consider the following two `DataFrame`s, which contain information on several employees in a company:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
display('df1', 'df2')
df1
employee | group | |
---|---|---|
0 | Bob | Accounting |
1 | Jake | Engineering |
2 | Lisa | Engineering |
3 | Sue | HR |
df2
employee | hire_date | |
---|---|---|
0 | Lisa | 2004 |
1 | Bob | 2008 |
2 | Jake | 2012 |
3 | Sue | 2014 |
To combine this information into a single `DataFrame`, we can use the `pd.merge()` function:
df3 = pd.merge(df1, df2)
df3
employee | group | hire_date | |
---|---|---|---|
0 | Bob | Accounting | 2008 |
1 | Jake | Engineering | 2012 |
2 | Lisa | Engineering | 2004 |
3 | Sue | HR | 2014 |
The `pd.merge()` function recognizes that each `DataFrame` has an "employee" column, and automatically joins using this column as a key.
The result of the merge is a new `DataFrame` that combines the information from the two inputs.
Notice that the order of entries in each column is not necessarily maintained: in this case, the order of the "employee" column differs between `df1` and `df2`, and the `pd.merge()` function correctly accounts for this.
Additionally, keep in mind that the merge in general discards the index, except in the special case of merges by index (see the `left_index` and `right_index` keywords, discussed momentarily).
Many-to-one joins are joins in which one of the two key columns contains duplicate entries.
For the many-to-one case, the resulting `DataFrame` will preserve those duplicate entries as appropriate:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
display('df3', 'df4', 'pd.merge(df3, df4)')
df3
employee | group | hire_date | |
---|---|---|---|
0 | Bob | Accounting | 2008 |
1 | Jake | Engineering | 2012 |
2 | Lisa | Engineering | 2004 |
3 | Sue | HR | 2014 |
df4
group | supervisor | |
---|---|---|
0 | Accounting | Carly |
1 | Engineering | Guido |
2 | HR | Steve |
pd.merge(df3, df4)
employee | group | hire_date | supervisor | |
---|---|---|---|---|
0 | Bob | Accounting | 2008 | Carly |
1 | Jake | Engineering | 2012 | Guido |
2 | Lisa | Engineering | 2004 | Guido |
3 | Sue | HR | 2014 | Steve |
Many-to-many joins are a bit confusing conceptually, but are nevertheless well defined.
If the key column in both the left and right arrays contains duplicates, then the result is a many-to-many merge.
Consider the following, where we have a `DataFrame` showing one or more skills associated with a particular group.
By performing a many-to-many join, we can recover the skills associated with any individual person:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
'Engineering', 'Engineering', 'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
display('df1', 'df5', "pd.merge(df1, df5)")
df1
employee | group | |
---|---|---|
0 | Bob | Accounting |
1 | Jake | Engineering |
2 | Lisa | Engineering |
3 | Sue | HR |
df5
group | skills | |
---|---|---|
0 | Accounting | math |
1 | Accounting | spreadsheets |
2 | Engineering | coding |
3 | Engineering | linux |
4 | HR | spreadsheets |
5 | HR | organization |
pd.merge(df1, df5)
employee | group | skills | |
---|---|---|---|
0 | Bob | Accounting | math |
1 | Bob | Accounting | spreadsheets |
2 | Jake | Engineering | coding |
3 | Jake | Engineering | linux |
4 | Lisa | Engineering | coding |
5 | Lisa | Engineering | linux |
6 | Sue | HR | spreadsheets |
7 | Sue | HR | organization |
We've already seen the default behavior of pd.merge()
: it looks for one or more matching column names between the two inputs, and uses this as the key.
However, often the column names will not match so nicely, and pd.merge()
provides a variety of options for handling this.
The on keyword

Most simply, you can explicitly specify the name of the key column using the on keyword, which takes a column name or a list of column names:
display('df1', 'df2', "pd.merge(df1, df2, on='employee')")
df1
employee | group | |
---|---|---|
0 | Bob | Accounting |
1 | Jake | Engineering |
2 | Lisa | Engineering |
3 | Sue | HR |
df2
employee | hire_date | |
---|---|---|
0 | Lisa | 2004 |
1 | Bob | 2008 |
2 | Jake | 2012 |
3 | Sue | 2014 |
pd.merge(df1, df2, on='employee')
employee | group | hire_date | |
---|---|---|---|
0 | Bob | Accounting | 2008 |
1 | Jake | Engineering | 2012 |
2 | Lisa | Engineering | 2004 |
3 | Sue | HR | 2014 |
The left_on and right_on keywords

At times you may wish to merge two datasets with different column names; for example, we may have a dataset in which the employee name is labeled as "name" rather than "employee". In this case, we can use the left_on and right_on keywords to specify the two column names:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
display('df1', 'df3', 'pd.merge(df1, df3, left_on="employee", right_on="name")')
df1
employee | group | |
---|---|---|
0 | Bob | Accounting |
1 | Jake | Engineering |
2 | Lisa | Engineering |
3 | Sue | HR |
df3
name | salary | |
---|---|---|
0 | Bob | 70000 |
1 | Jake | 80000 |
2 | Lisa | 120000 |
3 | Sue | 90000 |
pd.merge(df1, df3, left_on="employee", right_on="name")
employee | group | name | salary | |
---|---|---|---|---|
0 | Bob | Accounting | Bob | 70000 |
1 | Jake | Engineering | Jake | 80000 |
2 | Lisa | Engineering | Lisa | 120000 |
3 | Sue | HR | Sue | 90000 |
The result has a redundant column that we can drop if desired–for example, by using the drop()
method of DataFrame
s:
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)
employee | group | salary | |
---|---|---|---|
0 | Bob | Accounting | 70000 |
1 | Jake | Engineering | 80000 |
2 | Lisa | Engineering | 120000 |
3 | Sue | HR | 90000 |
The left_index and right_index keywords

Sometimes, rather than merging on a column, you would instead like to merge on an index. For example, your data might look like this:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
display('df1a', 'df2a')
df1a
group | |
---|---|
employee | |
Bob | Accounting |
Jake | Engineering |
Lisa | Engineering |
Sue | HR |
df2a
hire_date | |
---|---|
employee | |
Lisa | 2004 |
Bob | 2008 |
Jake | 2012 |
Sue | 2014 |
You can use the index as the key for merging by specifying the left_index
and/or right_index
flags in pd.merge()
:
display('df1a', 'df2a', "pd.merge(df1a, df2a, left_index=True, right_index=True)")
df1a
group | |
---|---|
employee | |
Bob | Accounting |
Jake | Engineering |
Lisa | Engineering |
Sue | HR |
df2a
hire_date | |
---|---|
employee | |
Lisa | 2004 |
Bob | 2008 |
Jake | 2012 |
Sue | 2014 |
pd.merge(df1a, df2a, left_index=True, right_index=True)
group | hire_date | |
---|---|---|
employee | ||
Bob | Accounting | 2008 |
Jake | Engineering | 2012 |
Lisa | Engineering | 2004 |
Sue | HR | 2014 |
For convenience, DataFrame
s implement the join()
method, which performs a merge that defaults to joining on indices:
display('df1a', 'df2a', 'df1a.join(df2a)')
df1a
group | |
---|---|
employee | |
Bob | Accounting |
Jake | Engineering |
Lisa | Engineering |
Sue | HR |
df2a
hire_date | |
---|---|
employee | |
Lisa | 2004 |
Bob | 2008 |
Jake | 2012 |
Sue | 2014 |
df1a.join(df2a)
group | hire_date | |
---|---|---|
employee | ||
Bob | Accounting | 2008 |
Jake | Engineering | 2012 |
Lisa | Engineering | 2004 |
Sue | HR | 2014 |
If you'd like to mix indices and columns, you can combine left_index
with right_on
or left_on
with right_index
to get the desired behavior:
display('df1a', 'df3', "pd.merge(df1a, df3, left_index=True, right_on='name')")
df1a
group | |
---|---|
employee | |
Bob | Accounting |
Jake | Engineering |
Lisa | Engineering |
Sue | HR |
df3
name | salary | |
---|---|---|
0 | Bob | 70000 |
1 | Jake | 80000 |
2 | Lisa | 120000 |
3 | Sue | 90000 |
pd.merge(df1a, df3, left_index=True, right_on='name')
group | name | salary | |
---|---|---|---|
0 | Accounting | Bob | 70000 |
1 | Engineering | Jake | 80000 |
2 | Engineering | Lisa | 120000 |
3 | HR | Sue | 90000 |
We have glossed over one important consideration in performing a join: the type of set arithmetic used in the join. This comes up when a value appears in one key column but not the other:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'], 'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'], 'drink': ['wine', 'beer']},
columns=['name', 'drink'])
display('df6', 'df7', 'pd.merge(df6, df7)')
df6
name | food | |
---|---|---|
0 | Peter | fish |
1 | Paul | beans |
2 | Mary | bread |
df7
name | drink | |
---|---|---|
0 | Mary | wine |
1 | Joseph | beer |
pd.merge(df6, df7)
name | food | drink | |
---|---|---|---|
0 | Mary | bread | wine |
Here we have merged two datasets that have only a single "name" entry in common: Mary.
By default, the result contains the intersection of the two sets of inputs; this is what is known as an inner join.
We can specify this explicitly using the how
keyword, which defaults to "inner"
:
pd.merge(df6, df7, how='inner')
name | food | drink | |
---|---|---|---|
0 | Mary | bread | wine |
Other options for the how
keyword are 'outer'
, 'left'
, and 'right'
.
An outer join returns a join over the union of the input columns, and fills in all missing values with NAs:
display('df6', 'df7', "pd.merge(df6, df7, how='outer')")
df6
name | food | |
---|---|---|
0 | Peter | fish |
1 | Paul | beans |
2 | Mary | bread |
df7
name | drink | |
---|---|---|
0 | Mary | wine |
1 | Joseph | beer |
pd.merge(df6, df7, how='outer')
name | food | drink | |
---|---|---|---|
0 | Peter | fish | NaN |
1 | Paul | beans | NaN |
2 | Mary | bread | wine |
3 | Joseph | NaN | beer |
The left join and right join return joins over the left entries and right entries, respectively:
display('df6', 'df7', "pd.merge(df6, df7, how='left')")
df6
name | food | |
---|---|---|
0 | Peter | fish |
1 | Paul | beans |
2 | Mary | bread |
df7
name | drink | |
---|---|---|
0 | Mary | wine |
1 | Joseph | beer |
pd.merge(df6, df7, how='left')
name | food | drink | |
---|---|---|---|
0 | Peter | fish | NaN |
1 | Paul | beans | NaN |
2 | Mary | bread | wine |
display('df6', 'df7', "pd.merge(df6, df7, how='right')")
df6
name | food | |
---|---|---|
0 | Peter | fish |
1 | Paul | beans |
2 | Mary | bread |
df7
name | drink | |
---|---|---|
0 | Mary | wine |
1 | Joseph | beer |
pd.merge(df6, df7, how='right')
name | food | drink | |
---|---|---|---|
0 | Mary | bread | wine |
1 | Joseph | NaN | beer |
The suffixes keyword

Finally, you may end up in a case where your two input DataFrames have conflicting column names:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
display('df8', 'df9', 'pd.merge(df8, df9, on="name")')
df8
name | rank | |
---|---|---|
0 | Bob | 1 |
1 | Jake | 2 |
2 | Lisa | 3 |
3 | Sue | 4 |
df9
name | rank | |
---|---|---|
0 | Bob | 3 |
1 | Jake | 1 |
2 | Lisa | 4 |
3 | Sue | 2 |
pd.merge(df8, df9, on="name")
name | rank_x | rank_y | |
---|---|---|---|
0 | Bob | 1 | 3 |
1 | Jake | 2 | 1 |
2 | Lisa | 3 | 4 |
3 | Sue | 4 | 2 |
Because the output would have two conflicting column names, the merge function automatically appends a suffix _x
or _y
to make the output columns unique.
If these defaults are inappropriate, it is possible to specify a custom suffix using the suffixes
keyword:
display('df8', 'df9', 'pd.merge(df8, df9, on="name", suffixes=["_L", "_R"])')
df8
name | rank | |
---|---|---|
0 | Bob | 1 |
1 | Jake | 2 |
2 | Lisa | 3 |
3 | Sue | 4 |
df9
name | rank | |
---|---|---|
0 | Bob | 3 |
1 | Jake | 1 |
2 | Lisa | 4 |
3 | Sue | 2 |
pd.merge(df8, df9, on="name", suffixes=["_L", "_R"])
name | rank_L | rank_R | |
---|---|---|---|
0 | Bob | 1 | 3 |
1 | Jake | 2 | 1 |
2 | Lisa | 3 | 4 |
3 | Sue | 4 | 2 |
Merge and join operations come up most often when combining data from different sources. Here we will consider an example of some data about US states and their populations. The data files can be found at http://github.com/jakevdp/data-USstates/:
pop = pd.read_csv('data/state-population.csv')
areas = pd.read_csv('data/state-areas.csv')
abbrevs = pd.read_csv('data/state-abbrevs.csv')
display('pop.head()', 'areas.head()', 'abbrevs.head()')
pop.head()
state/region | ages | year | population | |
---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 |
1 | AL | total | 2012 | 4817528.0 |
2 | AL | under18 | 2010 | 1130966.0 |
3 | AL | total | 2010 | 4785570.0 |
4 | AL | under18 | 2011 | 1125763.0 |
areas.head()
state | area (sq. mi) | |
---|---|---|
0 | Alabama | 52423 |
1 | Alaska | 656425 |
2 | Arizona | 114006 |
3 | Arkansas | 53182 |
4 | California | 163707 |
abbrevs.head()
state | abbreviation | |
---|---|---|
0 | Alabama | AL |
1 | Alaska | AK |
2 | Arizona | AZ |
3 | Arkansas | AR |
4 | California | CA |
Given this information, say we want to compute a relatively straightforward result: rank US states and territories by their 2010 population density. We clearly have the relevant data here, but we'll have to combine the datasets to get it.
We'll start with a many-to-one merge that will give us the full state name within the population DataFrame
.
We want to merge based on the state/region
column of pop
, and the abbreviation
column of abbrevs
.
We'll use how='outer'
to make sure no data is thrown away due to mismatched labels.
merged = pd.merge(pop, abbrevs, how='outer',
left_on='state/region', right_on='abbreviation')
merged = merged.drop('abbreviation', axis=1) # drop duplicate info
merged.head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 | Alabama |
1 | AL | total | 2012 | 4817528.0 | Alabama |
2 | AL | under18 | 2010 | 1130966.0 | Alabama |
3 | AL | total | 2010 | 4785570.0 | Alabama |
4 | AL | under18 | 2011 | 1125763.0 | Alabama |
Let's double-check whether there were any mismatches here, which we can do by looking for rows with nulls:
merged.isnull().any()
state/region False ages False year False population True state True dtype: bool
Some of the population
info is null:
merged[merged['population'].isnull()].head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
2448 | PR | under18 | 1990 | NaN | NaN |
2449 | PR | total | 1990 | NaN | NaN |
2450 | PR | total | 1991 | NaN | NaN |
2451 | PR | under18 | 1991 | NaN | NaN |
2452 | PR | total | 1993 | NaN | NaN |
It appears that all the null population values are from Puerto Rico prior to the year 2000; this is likely because that data was not available from the original source.
More importantly, we see that some of the new state
entries are also null, which means that there was no corresponding entry in the abbrevs
key!
Let's figure out which regions lack this match:
merged.loc[merged['state'].isnull(), 'state/region'].unique()
array(['PR', 'USA'], dtype=object)
We can quickly infer the issue: our population data includes entries for Puerto Rico (PR) and the United States as a whole (USA), while these entries do not appear in the state abbreviation key. We can fix these quickly by filling in appropriate entries:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.isnull().any()
state/region False ages False year False population True state False dtype: bool
No more nulls in the state
column: we're all set!
Now we can merge the result with the area data using a similar procedure.
Examining our results, we will want to join on the state
column in both:
final = pd.merge(merged, areas, on='state', how='left')
final.head()
state/region | ages | year | population | state | area (sq. mi) | |
---|---|---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 | Alabama | 52423.0 |
1 | AL | total | 2012 | 4817528.0 | Alabama | 52423.0 |
2 | AL | under18 | 2010 | 1130966.0 | Alabama | 52423.0 |
3 | AL | total | 2010 | 4785570.0 | Alabama | 52423.0 |
4 | AL | under18 | 2011 | 1125763.0 | Alabama | 52423.0 |
Again, let's check for nulls to see if there were any mismatches:
final.isnull().any()
state/region False ages False year False population True state False area (sq. mi) True dtype: bool
There are nulls in the area
column; we can take a look to see which regions were ignored here:
final['state'][final['area (sq. mi)'].isnull()].unique()
array(['United States'], dtype=object)
We see that our areas
DataFrame
does not contain the area of the United States as a whole.
We could insert the appropriate value (using the sum of all state areas, for instance), but in this case we'll just drop the null values because the population density of the entire United States is not relevant to our current discussion:
final.dropna(inplace=True)
final.head()
state/region | ages | year | population | state | area (sq. mi) | |
---|---|---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 | Alabama | 52423.0 |
1 | AL | total | 2012 | 4817528.0 | Alabama | 52423.0 |
2 | AL | under18 | 2010 | 1130966.0 | Alabama | 52423.0 |
3 | AL | total | 2010 | 4785570.0 | Alabama | 52423.0 |
4 | AL | under18 | 2011 | 1125763.0 | Alabama | 52423.0 |
Now we have all the data we need. To answer the question of interest, let's first select the portion of the data corresponding to the year 2010 and the total population.
We'll use the query()
method to do this quickly:
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.head()
state/region | ages | year | population | state | area (sq. mi) | |
---|---|---|---|---|---|---|
3 | AL | total | 2010 | 4785570.0 | Alabama | 52423.0 |
91 | AK | total | 2010 | 713868.0 | Alaska | 656425.0 |
101 | AZ | total | 2010 | 6408790.0 | Arizona | 114006.0 |
189 | AR | total | 2010 | 2922280.0 | Arkansas | 53182.0 |
197 | CA | total | 2010 | 37333601.0 | California | 163707.0 |
Now let's compute the population density and display it in order. We'll start by re-indexing our data on the state, and then compute the result:
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']
density.sort_values(ascending=False, inplace=True)
density.head()
state District of Columbia 8898.897059 Puerto Rico 1058.665149 New Jersey 1009.253268 Rhode Island 681.339159 Connecticut 645.600649 dtype: float64
The result is a ranking of US states plus Washington, DC, and Puerto Rico in order of their 2010 population density, in residents per square mile. We can see that by far the densest region in this dataset is Washington, DC (i.e., the District of Columbia); among states, the densest is New Jersey.
We can also check the end of the list:
density.tail()
state South Dakota 10.583512 North Dakota 9.537565 Montana 6.736171 Wyoming 5.768079 Alaska 1.087509 dtype: float64
We see that the least dense state, by far, is Alaska, averaging slightly over one resident per square mile.
This type of messy data merging is a common task when trying to answer questions using real-world data sources.
An essential piece of analysis of large data is efficient summarization: computing aggregations like sum()
, mean()
, median()
, min()
, and max()
, in which a single number gives insight into the nature of a potentially large dataset.
We'll use the same display
class as before to show multiple objects side by side:
class display(object):
"""Display HTML representation of multiple objects"""
template = """<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
</div>"""
def __init__(self, *args):
self.args = args
def _repr_html_(self):
return '\n'.join(self.template.format(a, eval(a)._repr_html_())
for a in self.args)
def __repr__(self):
return '\n\n'.join(a + '\n' + repr(eval(a))
for a in self.args)
Here we will use the Planets dataset, available via the Seaborn package. It gives information on planets that astronomers have discovered around other stars (known as extrasolar planets or exoplanets for short). It can be downloaded with a simple Seaborn command:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape # 1,000+ extrasolar planets discovered up to 2014.
(1035, 6)
planets.head()
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser
0 0.374540 1 0.950714 2 0.731994 3 0.598658 4 0.156019 dtype: float64
ser.sum(), ser.mean()
(2.811925491708157, 0.5623850983416314)
For a DataFrame
, by default the aggregates return results within each column:
df = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
df
A | B | |
---|---|---|
0 | 0.183405 | 0.611853 |
1 | 0.304242 | 0.139494 |
2 | 0.524756 | 0.292145 |
3 | 0.431945 | 0.366362 |
4 | 0.291229 | 0.456070 |
df.mean()
A 0.347115 B 0.373185 dtype: float64
By specifying the axis
argument, you can instead aggregate within each row:
df.mean(axis='columns')
0 0.397629 1 0.221868 2 0.408451 3 0.399153 4 0.373650 dtype: float64
Pandas Series
and DataFrame
s provide a convenience method describe()
that computes several common aggregates for each column and returns the result:
planets.dropna().describe() # dropping rows with missing values
number | orbital_period | mass | distance | year | |
---|---|---|---|---|---|
count | 498.00000 | 498.000000 | 498.000000 | 498.000000 | 498.000000 |
mean | 1.73494 | 835.778671 | 2.509320 | 52.068213 | 2007.377510 |
std | 1.17572 | 1469.128259 | 3.636274 | 46.596041 | 4.167284 |
min | 1.00000 | 1.328300 | 0.003600 | 1.350000 | 1989.000000 |
25% | 1.00000 | 38.272250 | 0.212500 | 24.497500 | 2005.000000 |
50% | 1.00000 | 357.000000 | 1.245000 | 39.940000 | 2009.000000 |
75% | 2.00000 | 999.600000 | 2.867500 | 59.332500 | 2011.000000 |
max | 6.00000 | 17337.500000 | 25.000000 | 354.000000 | 2014.000000 |
This can be a useful way to begin understanding the overall properties of a dataset.
For example, we see in the year
column that although exoplanets were discovered as far back as 1989, half of all known exoplanets were not discovered until 2010 or after.
This is largely thanks to the Kepler mission, a space-based telescope specifically designed for finding transiting planets around other stars.
The following table summarizes some other built-in Pandas aggregations:
Aggregation | Description |
---|---|
count() |
Total number of items |
first() , last() |
First and last item |
mean() , median() |
Mean and median |
min() , max() |
Minimum and maximum |
std() , var() |
Standard deviation and variance |
mad() |
Mean absolute deviation |
prod() |
Product of all items |
sum() |
Sum of all items |
These are all methods of DataFrame
and Series
objects.
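For instance, here is a quick sketch applying a few of the aggregates from this table to the small df defined above:
df.sum()  # column-wise sums
df.std()  # column-wise standard deviations
df.prod() # column-wise products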
Simple aggregations can give you a flavor of your dataset, but often we would prefer to aggregate conditionally on some label or index: this is implemented in the so-called groupby
operation.
The name "group by" comes from a command in the SQL database language, but it is perhaps more illuminative to think of it in the terms first coined by Hadley Wickham of Rstats fame: split, apply, combine.
A canonical example of this split-apply-combine operation, where the "apply" is a summation aggregation, makes clear what the groupby accomplishes:

- The split step involves breaking up and grouping a DataFrame depending on the value of the specified key.
- The apply step involves computing some function, usually an aggregate, transformation, or filtering, within the individual groups.
- The combine step merges the results of these operations into an output array.

While this could certainly be done manually using some combination of the masking, aggregation, and merging commands covered earlier, an important realization is that the intermediate splits do not need to be explicitly instantiated. Rather, the GroupBy
can (often) do this in a single pass over the data, updating the sum, mean, count, min, or other aggregate for each group along the way.
The power of the GroupBy
is that it abstracts away these steps: the user need not think about how the computation is done under the hood, but rather thinks about the operation as a whole.
As a concrete example, let's take a look at using Pandas for the computation shown in this diagram:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data': range(6)}, columns=['key', 'data'])
df
key | data | |
---|---|---|
0 | A | 0 |
1 | B | 1 |
2 | C | 2 |
3 | A | 3 |
4 | B | 4 |
5 | C | 5 |
The most basic split-apply-combine operation can be computed with the groupby()
method of DataFrame
s, passing the name of the desired key column:
df.groupby('key')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fbb19adca30>
Notice that what is returned is not a set of DataFrame
s, but a DataFrameGroupBy
object.
This object is where the magic is: you can think of it as a special view of the DataFrame
, which is poised to dig into the groups but does no actual computation until the aggregation is applied.
This lazy evaluation approach means that common aggregates can be implemented very efficiently in a way that is almost transparent to the user.
To produce a result, we can apply an aggregate to this DataFrameGroupBy
object, which will perform the appropriate apply/combine steps to produce the desired result:
df.groupby('key').sum()
data | |
---|---|
key | |
A | 3 |
B | 5 |
C | 7 |
type(_)
pandas.core.frame.DataFrame
The sum()
method is just one possibility here; you can apply virtually any common Pandas or NumPy aggregation function, as well as virtually any valid DataFrame
operation.
The GroupBy
object is a very flexible abstraction.
In many ways, you can simply treat it as if it's a collection of DataFrame
s, and it does the difficult things under the hood. Let's see some examples using the Planets data.
Perhaps the most important operations made available by a GroupBy
are aggregate, filter, transform, and apply, but before that let's introduce some of the other functionality that can be used with the basic GroupBy
operation.
The GroupBy
object supports column indexing in the same way as the DataFrame
, and returns a modified GroupBy
object:
planets.groupby('method')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f864d921470>
planets.groupby('method')['orbital_period']
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f864d921898>
Here we've selected a particular Series
group from the original DataFrame
group by reference to its column name. As with the GroupBy
object, no computation is done until we call some aggregate on the object:
planets.groupby('method')['orbital_period'].median()
method Astrometry 631.180000 Eclipse Timing Variations 4343.500000 Imaging 27500.000000 Microlensing 3300.000000 Orbital Brightness Modulation 0.342887 Pulsar Timing 66.541900 Pulsation Timing Variations 1170.000000 Radial Velocity 360.200000 Transit 5.714932 Transit Timing Variations 57.011000 Name: orbital_period, dtype: float64
The GroupBy
object supports direct iteration over the groups, returning each group as a Series
or DataFrame
:
for (method, group) in planets.groupby('method'):
print("{0:30s} shape={1}".format(method, group.shape))
Astrometry shape=(2, 6) Eclipse Timing Variations shape=(9, 6) Imaging shape=(38, 6) Microlensing shape=(23, 6) Orbital Brightness Modulation shape=(3, 6) Pulsar Timing shape=(5, 6) Pulsation Timing Variations shape=(1, 6) Radial Velocity shape=(553, 6) Transit shape=(397, 6) Transit Timing Variations shape=(4, 6)
Through some Python class magic, any method not explicitly implemented by the GroupBy
object will be passed through and called on the groups, whether they are DataFrame
or Series
objects.
For example, you can use the describe()
method of DataFrame
s to perform a set of aggregations that describe each group in the data:
planets.groupby('method')['year'].describe()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
method | ||||||||
Astrometry | 2.0 | 2011.500000 | 2.121320 | 2010.0 | 2010.75 | 2011.5 | 2012.25 | 2013.0 |
Eclipse Timing Variations | 9.0 | 2010.000000 | 1.414214 | 2008.0 | 2009.00 | 2010.0 | 2011.00 | 2012.0 |
Imaging | 38.0 | 2009.131579 | 2.781901 | 2004.0 | 2008.00 | 2009.0 | 2011.00 | 2013.0 |
Microlensing | 23.0 | 2009.782609 | 2.859697 | 2004.0 | 2008.00 | 2010.0 | 2012.00 | 2013.0 |
Orbital Brightness Modulation | 3.0 | 2011.666667 | 1.154701 | 2011.0 | 2011.00 | 2011.0 | 2012.00 | 2013.0 |
Pulsar Timing | 5.0 | 1998.400000 | 8.384510 | 1992.0 | 1992.00 | 1994.0 | 2003.00 | 2011.0 |
Pulsation Timing Variations | 1.0 | 2007.000000 | NaN | 2007.0 | 2007.00 | 2007.0 | 2007.00 | 2007.0 |
Radial Velocity | 553.0 | 2007.518987 | 4.249052 | 1989.0 | 2005.00 | 2009.0 | 2011.00 | 2014.0 |
Transit | 397.0 | 2011.236776 | 2.077867 | 2002.0 | 2010.00 | 2012.0 | 2013.00 | 2014.0 |
Transit Timing Variations | 4.0 | 2012.500000 | 1.290994 | 2011.0 | 2011.75 | 2012.5 | 2013.25 | 2014.0 |
This is just one example of the utility of dispatch methods.
Notice that they are applied to each individual group, and the results are then combined within GroupBy
and returned.
Again, any valid DataFrame
/Series
method can be used on the corresponding GroupBy
object, which allows for some very flexible and powerful operations!
GroupBy
objects have aggregate()
, filter()
, transform()
, and apply()
methods that efficiently implement a variety of useful operations before combining the grouped data.
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
We're now familiar with GroupBy
aggregations with sum()
, median()
, and the like, but the aggregate()
method allows for even more flexibility.
It can take a string, a function, or a list thereof, and compute all the aggregates at once.
df.groupby('key').aggregate([min, np.median, max])
data1 | data2 | |||||
---|---|---|---|---|---|---|
min | median | max | min | median | max | |
key | ||||||
A | 0 | 1.5 | 3 | 3 | 4.0 | 5 |
B | 1 | 2.5 | 4 | 0 | 3.5 | 7 |
C | 2 | 3.5 | 5 | 3 | 6.0 | 9 |
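Another useful pattern is to pass a dictionary mapping column names to the operations to be applied to those columns (a minimal sketch):
df.groupby('key').aggregate({'data1': 'min',
                             'data2': 'max'})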
A filtering operation allows you to drop data based on the group properties. For example, we might want to keep all groups in which the standard deviation is larger than some critical value:
def filter_func(x):
return x['data2'].std() > 4
display('df', "df.groupby('key').std()", "df.groupby('key').filter(filter_func)")
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
df.groupby('key').std()
data1 | data2 | |
---|---|---|
key | ||
A | 2.12132 | 1.414214 |
B | 2.12132 | 4.949747 |
C | 2.12132 | 4.242641 |
df.groupby('key').filter(filter_func)
key | data1 | data2 | |
---|---|---|---|
1 | B | 1 | 0 |
2 | C | 2 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
While aggregation must return a reduced version of the data, transformation can return a transformed version of the full data to recombine; the output has the same shape as the input. A common example is centering the data by subtracting the group-wise mean:
df.groupby('key').transform(lambda x: x - x.mean())
data1 | data2 | |
---|---|---|
0 | -1.5 | 1.0 |
1 | -1.5 | -3.5 |
2 | -1.5 | -3.0 |
3 | 1.5 | -1.0 |
4 | 1.5 | 3.5 |
5 | 1.5 | 3.0 |
The apply()
method lets you apply an arbitrary function to the group results.
The function should take a DataFrame
, and return either a Pandas object (e.g., DataFrame
, Series
) or a scalar; the combine operation will be tailored to the type of output returned.
def norm_by_data2(x):
# x is a DataFrame of group values
x['data1'] /= x['data2'].sum()
return x
display('df', "df.groupby('key').apply(norm_by_data2)")
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
df.groupby('key').apply(norm_by_data2)
key | data1 | data2 | |
---|---|---|---|
0 | A | 0.000000 | 5 |
1 | B | 0.142857 | 0 |
2 | C | 0.166667 | 3 |
3 | A | 0.375000 | 3 |
4 | B | 0.571429 | 7 |
5 | C | 0.416667 | 9 |
In the simple examples presented before, we split the DataFrame
on a single column name.
This is just one of many options by which the groups can be defined, and we'll go through some other options for group specification here.
The key can be any series or list with a length matching that of the DataFrame
:
L = [0, 1, 0, 1, 2, 0]
display('df', 'df.groupby(L).sum()')
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
df.groupby(L).sum()
data1 | data2 | |
---|---|---|
0 | 7 | 17 |
1 | 4 | 3 |
2 | 4 | 7 |
Of course, this means there's another, more verbose way of accomplishing the df.groupby('key')
from before:
display('df', "df.groupby(df['key']).sum()")
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
df.groupby(df['key']).sum()
data1 | data2 | |
---|---|---|
key | ||
A | 3 | 8 |
B | 5 | 7 |
C | 7 | 12 |
Another method is to provide a dictionary that maps index values to the group keys:
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
display('df2', 'df2.groupby(mapping).sum()')
df2
data1 | data2 | |
---|---|---|
key | ||
A | 0 | 5 |
B | 1 | 0 |
C | 2 | 3 |
A | 3 | 3 |
B | 4 | 7 |
C | 5 | 9 |
df2.groupby(mapping).sum()
data1 | data2 | |
---|---|---|
consonant | 12 | 19 |
vowel | 3 | 8 |
Similar to mapping, you can pass any Python function that takes the index value as input and returns the group:
display('df2', 'df2.groupby(str.lower).mean()')
df2
data1 | data2 | |
---|---|---|
key | ||
A | 0 | 5 |
B | 1 | 0 |
C | 2 | 3 |
A | 3 | 3 |
B | 4 | 7 |
C | 5 | 9 |
df2.groupby(str.lower).mean()
data1 | data2 | |
---|---|---|
a | 1.5 | 4.0 |
b | 2.5 | 3.5 |
c | 3.5 | 6.0 |
Further, any of the preceding key choices can be combined to group on a multi-index:
df2.groupby([str.lower, mapping]).mean()
data1 | data2 | ||
---|---|---|---|
a | vowel | 1.5 | 4.0 |
b | consonant | 2.5 | 3.5 |
c | consonant | 3.5 | 6.0 |
As an example of this, in a couple lines of Python code we can put all these together and count discovered planets by method and by decade:
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
decade | 1980s | 1990s | 2000s | 2010s |
---|---|---|---|---|
method | ||||
Astrometry | 0.0 | 0.0 | 0.0 | 2.0 |
Eclipse Timing Variations | 0.0 | 0.0 | 5.0 | 10.0 |
Imaging | 0.0 | 0.0 | 29.0 | 21.0 |
Microlensing | 0.0 | 0.0 | 12.0 | 15.0 |
Orbital Brightness Modulation | 0.0 | 0.0 | 0.0 | 5.0 |
Pulsar Timing | 0.0 | 9.0 | 1.0 | 1.0 |
Pulsation Timing Variations | 0.0 | 0.0 | 1.0 | 0.0 |
Radial Velocity | 1.0 | 52.0 | 475.0 | 424.0 |
Transit | 0.0 | 0.0 | 64.0 | 712.0 |
Transit Timing Variations | 0.0 | 0.0 | 0.0 | 9.0 |
This shows the power of combining many of the operations we've discussed up to this point when looking at realistic datasets. We immediately gain a coarse understanding of when and how planets have been discovered over the past several decades!
We have seen how the GroupBy
abstraction lets us explore relationships within a dataset.
A pivot table is a similar operation that is commonly seen in spreadsheets and other programs that operate on tabular data: it takes simple column-wise data as input, and groups the entries into a two-dimensional table that provides a multidimensional summarization of the data. The difference between pivot tables and GroupBy can sometimes cause confusion; it helps to think of pivot tables as essentially a multidimensional version of GroupBy aggregation.

We'll use the database of passengers on the Titanic, available through the Seaborn library:
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
To start learning more about this data, we might begin by grouping according to gender, survival status, or some combination thereof.
If you have read the previous section, you might be tempted to apply a GroupBy
operation–for example, let's look at survival rate by gender:
titanic.groupby('sex')[['survived']].mean()
survived | |
---|---|
sex | |
female | 0.742038 |
male | 0.188908 |
This immediately gives us some insight: overall, three of every four females on board survived, while only one in five males survived.
This is useful, but we might like to go one step deeper and look at survival by both sex and, say, class.
Using the vocabulary of GroupBy
, we might proceed using something like this:
we group by class and gender, select survival, apply a mean aggregate, combine the resulting groups, and then unstack the hierarchical index to reveal the hidden multidimensionality. In code:
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
class | First | Second | Third |
---|---|---|---|
sex | |||
female | 0.968085 | 0.921053 | 0.500000 |
male | 0.368852 | 0.157407 | 0.135447 |
This gives us a better idea of how both gender and class affected survival, but the code is starting to look a bit garbled.
While each step of this pipeline makes sense in light of the tools we've previously discussed, the long string of code is not particularly easy to read or use.
This two-dimensional GroupBy
is common enough that Pandas includes a convenience routine, pivot_table
, which succinctly handles this type of multi-dimensional aggregation.
Here is the equivalent to the preceding operation using the pivot_table
method of DataFrame
s:
titanic.pivot_table('survived', index='sex', columns='class')
class | First | Second | Third |
---|---|---|---|
sex | |||
female | 0.968085 | 0.921053 | 0.500000 |
male | 0.368852 | 0.157407 | 0.135447 |
This is eminently more readable than the groupby
approach, and produces the same result.
As you might expect of an early 20th-century transatlantic cruise, the survival gradient favors both women and higher classes.
First-class women survived with near certainty (hi, Rose!), while only one in ten third-class men survived (sorry, Jack!).
Just as in the GroupBy
, the grouping in pivot tables can be specified with multiple levels, and via a number of options.
For example, we might be interested in looking at age as a third dimension.
We'll bin the age using the pd.cut
function:
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
class | First | Second | Third | |
---|---|---|---|---|
sex | age | |||
female | (0, 18] | 0.909091 | 1.000000 | 0.511628 |
(18, 80] | 0.972973 | 0.900000 | 0.423729 | |
male | (0, 18] | 0.800000 | 0.600000 | 0.215686 |
(18, 80] | 0.375000 | 0.071429 | 0.133663 |
We can apply the same strategy when working with the columns as well; let's add info on the fare paid using pd.qcut
to automatically compute quantiles:
fare = pd.qcut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])
fare | (-0.001, 14.454] | (14.454, 512.329] | |||||
---|---|---|---|---|---|---|---|
class | First | Second | Third | First | Second | Third | |
sex | age | ||||||
female | (0, 18] | NaN | 1.000000 | 0.714286 | 0.909091 | 1.000000 | 0.318182 |
(18, 80] | NaN | 0.880000 | 0.444444 | 0.972973 | 0.914286 | 0.391304 | |
male | (0, 18] | NaN | 0.000000 | 0.260870 | 0.800000 | 0.818182 | 0.178571 |
(18, 80] | 0.0 | 0.098039 | 0.125000 | 0.391304 | 0.030303 | 0.192308 |
The result is a four-dimensional aggregation with hierarchical indices, shown in a grid demonstrating the relationship between the values.
The full call signature of the pivot_table
method of DataFrame
s is as follows:
# call signature as of Pandas 0.18
DataFrame.pivot_table(data, values=None, index=None, columns=None,
aggfunc='mean', fill_value=None, margins=False,
dropna=True, margins_name='All')
We've already seen examples of the first three arguments; here we'll take a quick look at the remaining ones.
Two of the options, fill_value
and dropna
, have to do with missing data and are fairly straightforward; a quick sketch of fill_value is shown below.
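For instance, a minimal sketch reusing the age and fare bins from earlier, with fill_value=0 replacing the NaN entries in empty cells:
titanic.pivot_table('survived', ['sex', age], [fare, 'class'], fill_value=0)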
The aggfunc
keyword controls what type of aggregation is applied, which is a mean by default.
As in the GroupBy, the aggregation specification can be a string representing one of several common choices (e.g., 'sum'
, 'mean'
, 'count'
, 'min'
, 'max'
, etc.) or a function that implements an aggregation (e.g., np.sum()
, min()
, sum()
, etc.).
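For instance (a minimal sketch), counting passengers in each cell rather than averaging survival:
titanic.pivot_table('survived', index='sex', columns='class', aggfunc='count')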
Additionally, it can be specified as a dictionary mapping a column to any of the above desired options:
titanic.pivot_table(index='sex', columns='class',
aggfunc={'survived':sum, 'fare':'mean'})
fare | survived | |||||
---|---|---|---|---|---|---|
class | First | Second | Third | First | Second | Third |
sex | ||||||
female | 106.125798 | 21.970121 | 16.118810 | 91 | 70 | 72 |
male | 67.226127 | 19.741782 | 12.661633 | 45 | 17 | 47 |
Notice also here that we've omitted the values
keyword; when specifying a mapping for aggfunc
, this is determined automatically.
At times it's useful to compute totals along each grouping.
This can be done via the margins
keyword:
titanic.pivot_table('survived', index='sex', columns='class', margins=True)
class | First | Second | Third | All |
---|---|---|---|---|
sex | ||||
female | 0.968085 | 0.921053 | 0.500000 | 0.742038 |
male | 0.368852 | 0.157407 | 0.135447 | 0.188908 |
All | 0.629630 | 0.472826 | 0.242363 | 0.383838 |
Here this automatically gives us information about the class-agnostic survival rate by gender, the gender-agnostic survival rate by class, and the overall survival rate of 38%.
The margin label can be specified with the margins_name
keyword, which defaults to "All"
.
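For example (a minimal sketch):
titanic.pivot_table('survived', index='sex', columns='class',
                    margins=True, margins_name='Total')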
As a more interesting example, let's take a look at the freely available data on births in the United States, provided by the Centers for Disease Control (CDC). This data can be found at https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv:
births = pd.read_csv('data/births.csv')
births.describe()
year | month | day | births | |
---|---|---|---|---|
count | 15547.000000 | 15547.000000 | 15067.000000 | 15547.000000 |
mean | 1979.037435 | 6.515919 | 17.769894 | 9762.293561 |
std | 6.728340 | 3.449632 | 15.284034 | 28552.465810 |
min | 1969.000000 | 1.000000 | 1.000000 | 1.000000 |
25% | 1974.000000 | 4.000000 | 8.000000 | 4358.000000 |
50% | 1979.000000 | 7.000000 | 16.000000 | 4814.000000 |
75% | 1984.000000 | 10.000000 | 24.000000 | 5289.500000 |
max | 2008.000000 | 12.000000 | 99.000000 | 199622.000000 |
Taking a look at the data, we see that it's relatively simple–it contains the number of births grouped by date and gender:
births.head()
year | month | day | gender | births | |
---|---|---|---|---|---|
0 | 1969 | 1 | 1.0 | F | 4046 |
1 | 1969 | 1 | 1.0 | M | 4440 |
2 | 1969 | 1 | 2.0 | F | 4454 |
3 | 1969 | 1 | 2.0 | M | 4548 |
4 | 1969 | 1 | 3.0 | F | 4548 |
We can start to understand this data a bit more by using a pivot table. Let's add a decade column, and take a look at male and female births as a function of decade:
births['decade'] = 10 * (births['year'] // 10)
births.pivot_table('births', index='decade', columns='gender', aggfunc='sum')
gender | F | M |
---|---|---|
decade | ||
1960 | 1753634 | 1846572 |
1970 | 16263075 | 17121550 |
1980 | 18310351 | 19243452 |
1990 | 19479454 | 20420553 |
2000 | 18229309 | 19106428 |
We immediately see that male births outnumber female births in every decade. To see this trend a bit more clearly, we can use the built-in plotting tools in Pandas to visualize the total number of births by year:
%matplotlib inline
import matplotlib.pyplot as plt
sns.set() # use Seaborn styles
births.pivot_table('births', index='year', columns='gender', aggfunc='sum').plot()
plt.ylabel('total births per year');
With a simple pivot table and plot()
method, we can immediately see the annual trend in births by gender. By eye, it appears that over the past 50 years male births have outnumbered female births by around 5%.
Though this doesn't necessarily relate to the pivot table, there are a few more interesting features we can pull out of this dataset using the Pandas tools covered up to this point. We must start by cleaning the data a bit, removing outliers caused by mistyped dates (e.g., June 31st) or missing values (e.g., June 99th). One easy way to remove these all at once is to cut outliers; we'll do this via a robust sigma-clipping operation:
quartiles = np.percentile(births['births'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])
This final line is a robust estimate of the sample standard deviation; the 0.74 comes from the interquartile range of a Gaussian distribution (for a Gaussian, the interquartile range is about $1.35\sigma$, so $\sigma \approx 0.74 \times \mathrm{IQR}$).
With this we can use the query()
method to filter out rows with births outside these values:
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
Next we set the day
column to integers; previously it had been a string because some columns in the dataset contained the value 'null'
:
# set 'day' column to integer; it originally was a string due to nulls
births['day'] = births['day'].astype(int)
Finally, we can combine the day, month, and year to create a Date index. This allows us to quickly compute the weekday corresponding to each row:
# create a datetime index from the year, month, day
births.index = pd.to_datetime(10000 * births.year +
100 * births.month +
births.day, format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
import matplotlib.pyplot as plt
import matplotlib as mpl
births.pivot_table('births', index='dayofweek',
columns='decade', aggfunc='mean').plot()
plt.gca().set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day');
Apparently births are slightly less common on weekends than on weekdays! Note that the 1990s and 2000s are missing because the CDC data contains only the month of birth starting in 1989.
Another interesting view is to plot the mean number of births by the day of the year. Let's first group the data by month and day separately:
births_by_date = births.pivot_table('births', [births.index.month, births.index.day])
births_by_date.head()
births | ||
---|---|---|
1 | 1 | 4009.225 |
2 | 4247.400 | |
3 | 4500.900 | |
4 | 4571.350 | |
5 | 4603.625 |
The result is a multi-index over months and days.
To make this easily plottable, let's turn these months and days into a date by associating them with a dummy year variable (making sure to choose a leap year so February 29th is correctly handled!):
from datetime import datetime
births_by_date.index = [datetime(2012, month, day)
for (month, day) in births_by_date.index]
births_by_date.head()
births | |
---|---|
2012-01-01 | 4009.225 |
2012-01-02 | 4247.400 |
2012-01-03 | 4500.900 |
2012-01-04 | 4571.350 |
2012-01-05 | 4603.625 |
Focusing on the month and day only, we now have a time series reflecting the average number of births by date of the year.
From this, we can use the plot
method to plot the data. It reveals some interesting trends:
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax);
In particular, the striking feature of this graph is the dip in birthrate on US holidays (e.g., Independence Day, Labor Day, Thanksgiving, Christmas, New Year's Day) although this likely reflects trends in scheduled/induced births rather than some deep psychosomatic effect on natural births.
Looking at this short example, you can see that many of the Python and Pandas tools we've seen to this point can be combined and used to gain insight from a variety of datasets.
Pandas was developed in the context of financial modeling, so as you might expect, it contains a fairly extensive set of tools for working with dates, times, and time-indexed data. Date and time data comes in a few flavors:

- Time stamps reference particular moments in time (e.g., July 4th, 2015 at 7:00 a.m.).
- Time intervals and periods reference a length of time between a particular beginning and end point.
- Time deltas or durations reference an exact length of time (e.g., a duration of 22.56 seconds).
The Python world has a number of available representations of dates, times, deltas, and timespans. While the time series tools provided by Pandas tend to be the most useful for data science applications, it is helpful to see their relationship to other packages used in Python.
datetime and dateutil

Python's basic objects for working with dates and times reside in the built-in datetime module. Along with the third-party dateutil module, you can use it to quickly perform a host of useful operations on dates and times:
from datetime import datetime
datetime(year=2015, month=7, day=4)
datetime.datetime(2015, 7, 4, 0, 0)
Or, using the dateutil
module, you can parse dates from a variety of string formats:
from dateutil import parser
date = parser.parse("4th of July, 2015")
date
datetime.datetime(2015, 7, 4, 0, 0)
Once you have a datetime
object, you can do things like printing the day of the week:
date.strftime('%A')
'Saturday'
A related package to be aware of is pytz
, which contains tools for working with the most migraine-inducing piece of time series data: time zones.
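For example, a minimal sketch of attaching a time zone, assuming the pytz package is installed:
from datetime import datetime
import pytz

eastern = pytz.timezone('US/Eastern')
aware = eastern.localize(datetime(2015, 7, 4, 12, 0)) # attach US/Eastern
aware.astimezone(pytz.utc)                            # convert the same moment to UTC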
The power of datetime
and dateutil
lies in their flexibility and easy syntax: you can use these objects and their built-in methods to easily perform nearly any operation you might be interested in.
Where they break down is when you wish to work with large arrays of dates and times:
just as lists of Python numerical variables are suboptimal compared to NumPy-style typed numerical arrays, lists of Python datetime objects are suboptimal compared to typed arrays of encoded dates.
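A rough illustration of the difference, using the datetime64 type introduced below (a sketch; exact timings vary by machine):
from datetime import datetime, timedelta

# the same 100,000 dates as Python objects and as a typed NumPy array
py_dates = [datetime(2015, 7, 4) + timedelta(days=i) for i in range(100000)]
np_dates = np.array(py_dates, dtype='datetime64[D]')

np_dates + 1                              # vectorized: one fast array operation
[d + timedelta(days=1) for d in py_dates] # per-object Python loop: much slower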
NumPy's datetime64

The weaknesses of Python's datetime format inspired the NumPy team to add a set of native time series data types to NumPy. The datetime64 dtype encodes dates as 64-bit integers, and thus allows arrays of dates to be represented very compactly:
date = np.array('2015-07-04', dtype=np.datetime64)
date
array('2015-07-04', dtype='datetime64[D]')
Once we have this date formatted, however, we can quickly do vectorized operations on it:
date + np.arange(12)
array(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11', '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'], dtype='datetime64[D]')
Because of the uniform type in NumPy datetime64
arrays, this type of operation can be accomplished much more quickly than if we were working directly with Python's datetime
objects, especially as arrays get large.
One detail of the datetime64
and timedelta64
objects is that they are built on a fundamental time unit.
Because the datetime64
object is limited to 64-bit precision, the range of encodable times is $2^{64}$ times this fundamental unit. In other words, datetime64
imposes a trade-off between time resolution and maximum time span.
If you want a time resolution of one nanosecond, you only have enough information to encode a range of $2^{64}$ nanoseconds, or just under 600 years:
np.datetime64('2015-07-04') # a day-based datetime
numpy.datetime64('2015-07-04')
np.datetime64('2015-07-04 12:00') # a minute-based datetime
numpy.datetime64('2015-07-04T12:00')
np.datetime64('2015-07-04 12:59:59.50', 'ns') # a nanosecond-based datetime
numpy.datetime64('2015-07-04T12:59:59.500000000')
The following table lists the available format codes along with the relative and absolute timespans that they can encode:
Code | Meaning | Time span (relative) | Time span (absolute) |
---|---|---|---|
Y |
Year | ± 9.2e18 years | [9.2e18 BC, 9.2e18 AD] |
M |
Month | ± 7.6e17 years | [7.6e17 BC, 7.6e17 AD] |
W |
Week | ± 1.7e17 years | [1.7e17 BC, 1.7e17 AD] |
D |
Day | ± 2.5e16 years | [2.5e16 BC, 2.5e16 AD] |
h |
Hour | ± 1.0e15 years | [1.0e15 BC, 1.0e15 AD] |
m |
Minute | ± 1.7e13 years | [1.7e13 BC, 1.7e13 AD] |
s |
Second | ± 2.9e11 years | [2.9e11 BC, 2.9e11 AD] |
ms |
Millisecond | ± 2.9e8 years | [ 2.9e8 BC, 2.9e8 AD] |
us |
Microsecond | ± 2.9e5 years | [290301 BC, 294241 AD] |
ns |
Nanosecond | ± 292 years | [ 1678 AD, 2262 AD] |
ps |
Picosecond | ± 106 days | [ 1969 AD, 1970 AD] |
fs |
Femtosecond | ± 2.6 hours | [ 1969 AD, 1970 AD] |
as |
Attosecond | ± 9.2 seconds | [ 1969 AD, 1970 AD] |
Pandas builds upon all the tools just discussed to provide a Timestamp
object, which combines the ease-of-use of datetime
and dateutil
with the efficient storage and vectorized interface of numpy.datetime64
.
From a group of these Timestamp
objects, Pandas can construct a DatetimeIndex
that can be used to index data in a Series
or DataFrame
:
date = pd.to_datetime("4th of July, 2015")
date
Timestamp('2015-07-04 00:00:00')
date.strftime('%A')
'Saturday'
date + pd.to_timedelta(np.arange(12), 'D') # do NumPy-style vectorized operations directly on this same object
DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11', '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'], dtype='datetime64[ns]', freq=None)
Where the Pandas time series tools really become useful is when you begin to index data by timestamps:
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data
2014-07-04 0 2014-08-04 1 2015-07-04 2 2015-08-04 3 dtype: int64
Now that we have this data in a Series
, we can make use of any of the Series
indexing patterns passing values that can be coerced into dates:
data['2014-07-04':'2015-07-04']
2014-07-04 0 2014-08-04 1 2015-07-04 2 dtype: int64
data['2015'] # additional special date-only indexing operations
2015-07-04 2 2015-08-04 3 dtype: int64
The fundamental Pandas data structures for working with time series data:

- For timestamps, Pandas provides the Timestamp type. As mentioned before, it is essentially a replacement for Python's native datetime, but is based on the more efficient numpy.datetime64 data type. The associated index structure is DatetimeIndex.
- For time periods, Pandas provides the Period type. This encodes a fixed-frequency interval based on numpy.datetime64. The associated index structure is PeriodIndex.
- For time deltas or durations, Pandas provides the Timedelta type. Timedelta is a more efficient replacement for Python's native datetime.timedelta type, and is based on numpy.timedelta64. The associated index structure is TimedeltaIndex.

The most fundamental of these date/time objects are the Timestamp
and DatetimeIndex
objects.
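As a quick illustration (not in the original text), each of these types can also be constructed directly:
pd.Timestamp('2015-07-04') # Timestamp('2015-07-04 00:00:00')
pd.Period('2015-07', freq='M') # Period('2015-07', 'M')
pd.Timedelta('1 days 2 hours') # Timedelta('1 days 02:00:00')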
While these class objects can be invoked directly, it is more common to use the `pd.to_datetime()` function, which can parse a wide variety of formats. Passing a single date to `pd.to_datetime()` yields a `Timestamp`; passing a series of dates by default yields a `DatetimeIndex`:
from datetime import datetime

dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                        '2015-Jul-6', '07-07-2015', '20150708'])
dates
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07', '2015-07-08'], dtype='datetime64[ns]', freq=None)
Any `DatetimeIndex` can be converted to a `PeriodIndex` with the `to_period()` method, with the addition of a frequency code; here we'll use `'D'` to indicate daily frequency:
dates.to_period('D')
PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07', '2015-07-08'], dtype='period[D]', freq='D')
dates - dates[0] # a ``TimedeltaIndex`` is created when one date is subtracted from another
TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)
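As an aside (not in the original text), the resulting TimedeltaIndex exposes convenient attributes, such as the number of whole days:
(dates - dates[0]).days # integer day counts: 0, 1, 3, 4, 5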
Regular sequences: pd.date_range()

To make the creation of regular date sequences more convenient, Pandas offers a few functions for this purpose: `pd.date_range()` for timestamps, `pd.period_range()` for periods, and `pd.timedelta_range()` for time deltas:
pd.date_range('2015-07-03', '2015-07-10') # by default, the frequency is one day
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'], dtype='datetime64[ns]', freq='D')
pd.date_range('2015-07-03', periods=8)
DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'], dtype='datetime64[ns]', freq='D')
pd.date_range('2015-07-03', periods=8, freq='H')
DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00', '2015-07-03 02:00:00', '2015-07-03 03:00:00', '2015-07-03 04:00:00', '2015-07-03 05:00:00', '2015-07-03 06:00:00', '2015-07-03 07:00:00'], dtype='datetime64[ns]', freq='H')
pd.period_range('2015-07', periods=8, freq='M')
PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12', '2016-01', '2016-02'], dtype='period[M]', freq='M')
pd.timedelta_range(0, periods=10, freq='H')
TimedeltaIndex(['00:00:00', '01:00:00', '02:00:00', '03:00:00', '04:00:00', '05:00:00', '06:00:00', '07:00:00', '08:00:00', '09:00:00'], dtype='timedelta64[ns]', freq='H')
Fundamental to these Pandas time series tools is the concept of a frequency or date offset. Just as we saw the `D` (day) and `H` (hour) codes above, we can use such codes to specify any desired frequency spacing:
| Code | Description  | Code | Description          |
|------|--------------|------|----------------------|
| `D`  | Calendar day | `B`  | Business day         |
| `W`  | Weekly       |      |                      |
| `M`  | Month end    | `BM` | Business month end   |
| `Q`  | Quarter end  | `BQ` | Business quarter end |
| `A`  | Year end     | `BA` | Business year end    |
| `H`  | Hours        | `BH` | Business hours       |
| `T`  | Minutes      |      |                      |
| `S`  | Seconds      |      |                      |
| `L`  | Milliseconds |      |                      |
| `U`  | Microseconds |      |                      |
| `N`  | Nanoseconds  |      |                      |
The monthly, quarterly, and annual frequencies are all marked at the end of the specified period. By adding an `S` suffix to any of these, they instead will be marked at the beginning:
| Code  | Description   | Code  | Description            |
|-------|---------------|-------|------------------------|
| `MS`  | Month start   | `BMS` | Business month start   |
| `QS`  | Quarter start | `BQS` | Business quarter start |
| `AS`  | Year start    | `BAS` | Business year start    |
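For instance (a small illustrative example, not from the original text), the `QS` code yields quarter-start dates:
pd.date_range('2015-01-01', periods=4, freq='QS')
# DatetimeIndex(['2015-01-01', '2015-04-01', '2015-07-01', '2015-10-01'], dtype='datetime64[ns]', freq='QS-JAN')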
Additionally, you can change the month used to mark any quarterly or annual code by adding a three-letter month code as a suffix:

- `Q-JAN`, `BQ-FEB`, `QS-MAR`, `BQS-APR`, etc.
- `A-JAN`, `BA-FEB`, `AS-MAR`, `BAS-APR`, etc.

In the same way, the split-point of the weekly frequency can be modified by adding a three-letter weekday code:

- `W-SUN`, `W-MON`, `W-TUE`, `W-WED`, etc.

On top of this, codes can be combined with numbers to specify other frequencies.
For example, for a frequency of 2 hours 30 minutes, we can combine the hour (`H`) and minute (`T`) codes as follows:
pd.timedelta_range(0, periods=9, freq="2H30T")
TimedeltaIndex(['00:00:00', '02:30:00', '05:00:00', '07:30:00', '10:00:00', '12:30:00', '15:00:00', '17:30:00', '20:00:00'], dtype='timedelta64[ns]', freq='150T')
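The anchored codes combine with dates in the same way; for instance (an illustrative aside, not from the original text), a weekly frequency split on Wednesdays:
pd.date_range('2015-07-01', periods=3, freq='W-WED')
# DatetimeIndex(['2015-07-01', '2015-07-08', '2015-07-15'], dtype='datetime64[ns]', freq='W-WED')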
All of these short codes refer to specific instances of Pandas time series offsets, which can be found in the `pd.tseries.offsets` module. For example, we can create a business day offset directly as follows:
from pandas.tseries.offsets import BDay
pd.date_range('2015-07-01', periods=5, freq=BDay())
DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-06', '2015-07-07'], dtype='datetime64[ns]', freq='B')
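These offset objects can also be added directly to a timestamp; as a quick sketch (not from the original text), one business day after a Friday is the following Monday:
pd.Timestamp('2015-07-03') + BDay() # Friday -> Timestamp('2015-07-06 00:00:00'), a Monday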
The ability to use dates and times as indices to intuitively organize and access data is an important piece of the Pandas time series tools. The benefits of indexed data in general (automatic alignment during operations, intuitive data slicing and access, etc.) still apply, and Pandas provides several additional time series-specific operations.
We will take a look at a few of those here, using some financial data as an example. Because Pandas was developed largely in a finance context, it includes some very specific tools for financial data.
For example, the accompanying `pandas-datareader` package (installable via `pip install pandas-datareader`) knows how to import financial data from a number of available sources, including Yahoo Finance, Google Finance, and others.
Here we'll download the CBOE Volatility Index, available from the FRED data source under the series name 'VIXCLS':
from pandas_datareader import data
goog = data.DataReader('VIXCLS', start='2004', end='2016', data_source='fred')  # daily VIX close, stored as ``goog`` for the examples below
goog.head()
VIXCLS | |
---|---|
DATE | |
2004-01-01 | NaN |
2004-01-02 | 18.22 |
2004-01-05 | 17.49 |
2004-01-06 | 16.73 |
2004-01-07 | 15.50 |
goog = goog['VIXCLS'] # for simplicity consider its sole column
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
goog.plot();
One common need for time series data is resampling at a higher or lower frequency. This can be done using the `resample()` method, or the much simpler `asfreq()` method. The primary difference between the two is that `resample()` is fundamentally a data aggregation, while `asfreq()` is fundamentally a data selection.
Let's compare what the two return when we down-sample the data; here we will resample the data at the end of the business year (`'BA'`):
goog.plot(alpha=0.5, style='-')
goog.resample('BA').mean().plot(style=':')
goog.asfreq('BA').plot(style='--');
plt.legend(['input', 'resample', 'asfreq'], loc='upper left');
Notice the difference: at each point, `resample` reports the average of the previous year, while `asfreq` reports the value at the end of the year.
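To make the aggregation-versus-selection distinction concrete, here is a tiny synthetic example (an aside with made-up data, not part of the original text):
s = pd.Series(range(6), index=pd.date_range('2020-01-01', periods=6, freq='D'))
s.resample('2D').mean() # aggregates each two-day bin: 0.5, 2.5, 4.5
s.asfreq('2D') # selects the value at each two-day point: 0, 2, 4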
For up-sampling, `resample()` and `asfreq()` are largely equivalent, though `resample()` has many more options available. In this case, the default for both methods is to leave the up-sampled points empty, that is, filled with NA values. Just as with the `fillna()` method discussed previously, `asfreq()` accepts a `method` argument to specify how values are imputed. Here we resample the business-day data at a daily frequency (i.e., including weekends):
fig, ax = plt.subplots(2, sharex=True)
data = goog.iloc[:10]
data.asfreq('D').plot(ax=ax[0], marker='o')
data.asfreq('D', method='bfill').plot(ax=ax[1], style='-o')
data.asfreq('D', method='ffill').plot(ax=ax[1], style='--o')
ax[1].legend(["back-fill", "forward-fill"]);
The top panel is the default: non-business days are left as NA values and do not appear on the plot. The bottom panel shows the differences between two strategies for filling the gaps: forward-filling and backward-filling.
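A minimal numeric sketch of these fill strategies (again an aside, with made-up data):
s = pd.Series([1.0, 2.0], index=pd.to_datetime(['2020-01-01', '2020-01-03']))
s.asfreq('D') # the gap at 2020-01-02 is NaN by default
s.asfreq('D', method='ffill') # the gap takes the previous value, 1.0
s.asfreq('D', method='bfill') # the gap takes the next value, 2.0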
Another common time series-specific operation is shifting of data in time. Pandas has two closely related methods for computing this: `shift()` and `tshift()`. In short, the difference between them is that `shift()` shifts the data, while `tshift()` shifts the index. In both cases, the shift is specified in multiples of the frequency. Here we will both `shift()` and `tshift()` by 900 days:
fig, ax = plt.subplots(3, sharey=True)
# apply a frequency to the data
goog = goog.asfreq('D', method='pad')
goog.plot(ax=ax[0])
goog.shift(900).plot(ax=ax[1])
goog.tshift(900).plot(ax=ax[2])
# legends and annotations
local_max = pd.to_datetime('2007-11-05')
offset = pd.Timedelta(900, 'D')
ax[0].legend(['input'], loc=2)
ax[0].get_xticklabels()[2].set(weight='heavy', color='red')
ax[0].axvline(local_max, alpha=0.3, color='red')
ax[1].legend(['shift(900)'], loc=2)
ax[1].get_xticklabels()[2].set(weight='heavy', color='red')
ax[1].axvline(local_max + offset, alpha=0.3, color='red')
ax[2].legend(['tshift(900)'], loc=2)
ax[2].get_xticklabels()[1].set(weight='heavy', color='red')
ax[2].axvline(local_max + offset, alpha=0.3, color='red');
We see here that shift(900)
shifts the data by 900 days, pushing some of it off the end of the graph (and leaving NA values at the other end), while tshift(900)
shifts the index values by 900 days.
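The same distinction is easy to see on a tiny synthetic series (an illustrative aside, not from the original text):
s = pd.Series([1, 2, 3], index=pd.date_range('2020-01-01', periods=3, freq='D'))
s.shift(1) # values move: NaN, 1, 2 on the original dates
s.tshift(1) # index moves: 1, 2, 3 on 2020-01-02 through 2020-01-04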
A common context for this type of shift is in computing differences over time. For example, we can use shifted values to compute the one-year percent change of the series over the course of the dataset:
change = 100 * (goog.tshift(-365) / goog - 1)
change.plot()
plt.ylabel('% Change over one year');
This helps us to see the overall behavior of the series: for the VIX, the largest one-year swings surround periods of market turmoil, most notably the 2008–2009 financial crisis.
Rolling statistics are a third type of time series-specific operation implemented by Pandas. These can be accomplished via the `rolling()` attribute of `Series` and `DataFrame` objects, which returns a view similar to what we saw with the `groupby` operation (see Aggregation and Grouping). This rolling view makes available a number of aggregation operations by default. For example, here is the one-year (365-day) centered rolling mean and standard deviation of the series:
rolling = goog.rolling(365, center=True)
data = pd.DataFrame({'input': goog,
'one-year rolling_mean': rolling.mean(),
'one-year rolling_std': rolling.std()})
ax = data.plot(style=['-', '--', ':'])
ax.lines[0].set_alpha(0.3)
As with group-by operations, the `aggregate()` and `apply()` methods can be used for custom rolling computations.
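For example (a brief sketch, not from the original text; the peak-to-peak statistic is an arbitrary choice), reusing the rolling object defined above:
rolling.aggregate(['mean', 'std']).head() # several built-in aggregates at once
rolling.apply(lambda x: x.max() - x.min()).head() # a custom rolling peak-to-peak range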
As a more involved example of working with some time series data, let's take a look at bicycle counts on Seattle's Fremont Bridge. This data comes from an automated bicycle counter, installed in late 2012, which has inductive sensors on the east and west sidewalks of the bridge. The hourly bicycle counts can be downloaded from http://data.seattle.gov/; here is the direct link to the dataset.
As of summer 2016, the CSV can be downloaded as follows:
!curl -o FremontBridge.csv https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 2024k 0 2024k 0 0 331k 0 --:--:-- 0:00:06 --:--:-- 462k
Once this dataset is downloaded, we can use Pandas to read the CSV output into a `DataFrame`. We will specify that we want the Date as an index, and we want these dates to be automatically parsed:
data = pd.read_csv('FremontBridge.csv', index_col='Date', parse_dates=True)
data.head()
Fremont Bridge Total | Fremont Bridge East Sidewalk | Fremont Bridge West Sidewalk | |
---|---|---|---|
Date | |||
2012-10-03 00:00:00 | 13.0 | 4.0 | 9.0 |
2012-10-03 01:00:00 | 10.0 | 4.0 | 6.0 |
2012-10-03 02:00:00 | 2.0 | 1.0 | 1.0 |
2012-10-03 03:00:00 | 5.0 | 2.0 | 3.0 |
2012-10-03 04:00:00 | 7.0 | 6.0 | 1.0 |
We'll further process this dataset by adding a "Total" column; here it simply mirrors the existing "Fremont Bridge Total" column under a shorter name:
data['Total'] = data["Fremont Bridge Total"]
data.dropna().describe() # have a look at summary statistics
Fremont Bridge Total | Fremont Bridge East Sidewalk | Fremont Bridge West Sidewalk | Total | |
---|---|---|---|---|
count | 64934.000000 | 64934.000000 | 64934.000000 | 64934.000000 |
mean | 113.805033 | 51.976191 | 61.828842 | 113.805033 |
std | 145.235402 | 67.013247 | 90.605138 | 145.235402 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 14.000000 | 6.000000 | 7.000000 | 14.000000 |
50% | 61.000000 | 28.000000 | 30.000000 | 61.000000 |
75% | 148.000000 | 70.000000 | 74.000000 | 148.000000 |
max | 1097.000000 | 698.000000 | 850.000000 | 1097.000000 |
We can gain some insight into the dataset by visualizing it:
%matplotlib inline
import seaborn; seaborn.set()
data.plot()
plt.ylabel('Hourly Bicycle Count');
The ~65,000 hourly samples are far too dense for us to make much sense of. We can gain more insight by resampling the data to a coarser grid:
weekly = data.resample('W').sum()
weekly.plot(style=[':', '--', '-'])
plt.ylabel('Weekly bicycle count');
This shows us some interesting seasonal trends: as you might expect, people bicycle more in the summer than in the winter, and even within a particular season the bicycle use varies from week to week (likely dependent on weather).
Another option that comes in handy for aggregating the data is a rolling mean, computed via the `rolling()` method we saw earlier. Here we'll do a 30-day rolling mean of the daily data, making sure to center the window:
daily = data.resample('D').sum()
daily.rolling(30, center=True).mean().plot(style=[':', '--', '-'])
plt.ylabel('mean daily count');
The jaggedness of the result is due to the hard cutoff of the window. We can get a smoother version of a rolling mean using a window function–for example, a Gaussian window.
The following code specifies both the width of the window (we chose 50 days) and the width of the Gaussian within the window (we chose 10 days):
daily.rolling(50, center=True,
              win_type='gaussian').mean(std=10).plot(style=[':', '--', '-']);
While these smoothed data views are useful to get an idea of the general trend in the data, they hide much of the interesting structure. For example, we might want to look at the average traffic as a function of the time of day; we do this by grouping:
by_time = data.groupby(data.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks=hourly_ticks, style=[':', '--', '-']);
The hourly traffic is a strongly bimodal distribution, with peaks around 8:00 in the morning and 5:00 in the evening. This is likely evidence of a strong component of commuter traffic crossing the bridge. This is further evidenced by the differences between the western sidewalk (generally used going toward downtown Seattle), which peaks more strongly in the morning, and the eastern sidewalk (generally used going away from downtown Seattle), which peaks more strongly in the evening.
We also might be curious about how things change based on the day of the week. Again, we can do this with a simple groupby:
by_weekday = data.groupby(data.index.dayofweek).mean()
by_weekday.index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-']);
This shows a strong distinction between weekday and weekend totals, with around twice as many average riders crossing the bridge on Monday through Friday as on Saturday and Sunday.
With this in mind, let's do a compound GroupBy and look at the hourly trend on weekdays versus weekends. We'll start by grouping by both a flag marking the weekend, and the time of day:
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
by_time = data.groupby([weekend, data.index.time]).mean()
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_time.loc['Weekday'].plot(ax=ax[0], title='Weekdays',
xticks=hourly_ticks, style=[':', '--', '-'])
by_time.loc['Weekend'].plot(ax=ax[1], title='Weekends',
xticks=hourly_ticks, style=[':', '--', '-']);
The result is very interesting: we see a bimodal commute pattern during the work week, and a unimodal recreational pattern during the weekends.