Learning by practicing

Pandas String Operations, etc.

Still learning about Pandas

#!/usr/bin/env python3

'''
    Pandas strings, etc

'''

import pandas as pd
import numpy as np
import string

def main():
    """Generate random age and income Series and demonstrate common Series operations.

    Two 26-element Series indexed by the lowercase letters a-z are filled with
    random integers, then filtered, tested, converted and summarised. Every
    result is printed to stdout; the function returns nothing.
    """
    # Age series: one random integer per letter, drawn with randint(1, 50)
    series_name_age = pd.Series(
        np.random.randint(1, 50, 26),
        index=list(string.ascii_lowercase[:26]),
        name='age',
    )
    series_name_age.index.name = 'Name'
    print(f'[*] Content of series_name_age \n{series_name_age}')

    # Income series: same index, values drawn with randint(100000, 500000)
    series_name_income = pd.Series(
        np.random.randint(100000, 500000, 26),
        index=list(string.ascii_lowercase[:26]),
        name='Income',
    )
    series_name_income.index.name = 'Name'
    print(f'\n[*] Content of series_name_income \n{series_name_income}')

    # A comparison yields a boolean mask (True/False per person) ...
    earns_over_400k = series_name_income > 400000
    print(f'\n[*] Here are the list of people making above 400K \n {earns_over_400k}')

    # ... and indexing with that mask keeps only the matching income values
    print(f'\n[*] Actual income values \n{series_name_income[earns_over_400k]}')

    # .all() answers "does every element satisfy the condition?"
    everyone_over_100k = (series_name_income > 100000).all()
    print(f'\n[*] Does everyone make above 100000? \n{everyone_over_100k}')
    everyone_over_400k = (series_name_income > 400000).all()
    print(f'\n[*] Does everyone make above 400000? \n{everyone_over_400k}')

    # .any() answers "does at least one element satisfy the condition?"
    anyone_over_450k = (series_name_income > 450000).any()
    print(f'\n[*] Does anyone make above 450000? \n{anyone_over_450k}')

    # Convert the income series to other representations
    conversions = (
        ('String', series_name_income.to_string()),
        ('List', series_name_income.to_list()),
        ('Dict', series_name_income.to_dict()),
        ('Json', series_name_income.to_json()),
    )
    for target, converted in conversions:
        print(f'\n[*] Series_name_income as {target} \n{converted}')

    labelled_series = (('age', series_name_age), ('income', series_name_income))

    # Distinct values that were generated for each series
    for label, series in labelled_series:
        print(f'\n[*] These are the unique values for {label}: \n{series.unique()}')

    # How often each value occurs (reveals duplicated random draws)
    age_counts = series_name_age.value_counts()
    print(f'\n[*] Age values usage and their occurrences: \n{age_counts}')
    income_counts = series_name_income.value_counts()
    print(f'\n[*] Income value usage and their occurrences: \n{income_counts}')

    # Smallest value of each series
    for label, series in labelled_series:
        print(f'\n[*] The minimum value for {label}: \n{series.min()}')

    # Largest value of each series
    for label, series in labelled_series:
        print(f'\n[*] The max value for {label}: \n{series.max()}')

    # Mean of each series, formatted to two decimal places
    for label, series in labelled_series:
        print(f'\n[*] The mean value for {label} to two decimals: \n{series.mean():.2f}')


if __name__ == '__main__':
    main()

Posts in this series:
Beginning Numpy
Beginning Pandas
Pandas String Operations, etc.

Pandas GroupBy

Learning about Pandas GroupBy from the perspective of the Iris Dataset

#!/usr/bin/env python3

'''
    Using the iris dataset to learn more about groupby
'''

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

def main():
    """Explore ``DataFrame.groupby`` using the iris dataset read from ``./iris.data``.

    The CSV must provide a ``species`` column, because the code reads
    ``iris_df.species`` and groups on ``'species'``. All results are printed
    to stdout; the function returns nothing.
    """
    iris_df = pd.read_csv('./iris.data')
    print(f'[*] First 10 records \n {iris_df.head(10)}')

    # Column labels found in the file
    print(f'\n[*] Column names \n{iris_df.columns}')

    # Distinct species present in the dataset
    species_names = set(iris_df.species)
    print(f'\n[*] The numer of unique species is: {len(species_names)}')
    print(f'[*] The unique species in the dataset are:  \n{species_names}')

    # Split the rows into one group per species
    group_by_species = iris_df.groupby('species')

    # .indices maps each group key to the row positions belonging to it
    positions_by_species = group_by_species.indices
    print(f'\n[*] Iris dataset now grouped by species \n {positions_by_species}')
    print(f'\n[*] Iris dataset keys \n {positions_by_species.keys()}')
    print(f'\n[*] Iris dataset values \n {positions_by_species.values()}')

    # Iterating a GroupBy yields (group key, sub-DataFrame) pairs
    for species_name, species_rows in group_by_species:
        print('\n \\//-->>   Group Starts Here   <<--\\//')
        print(f'\n [*]{species_name} {species_rows} \n')
        print('\n \\//-->>   Group Ends Here   <<--\\// \n')

    # The same pairs can be materialised into a list instead of iterating
    grouped_pairs = list(group_by_species)
    print(f'\n\n[*] List view - Datasets group by species \n {grouped_pairs}')

    # Pull out the rows of a single, named group
    setosa_rows = group_by_species.get_group('Iris-setosa')
    print(f'\n[*] Data for the Iris-setosa group \n {setosa_rows}')


if __name__ == '__main__':
    main()

Posts in this series:
Beginning Numpy
Beginning Pandas
Pandas String Operations, etc.

Pandas DataFrame Basics

* While the Pandas Series is like an array, the Pandas DataFrame is like a spreadsheet.
* A DataFrame has both rows and columns, which are generally labeled
* Rows represent the index
* A DataFrame has two axes. These are "axis=0" and "axis=1".
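* Axis=0 runs down the rows (the index). Use "axis=0" when an operation should work through all rows of each column; for example, "df.sum(axis=0)" returns one total per column
* Axis=1 runs across the columns. Use "axis=1" when an operation should work through all columns of each row; for example, "df.sum(axis=1)" returns one total per row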

#!/usr/bin/env python3

import numpy as np
import pandas as pd


def main():
    """Walk through basic DataFrame operations on a small hand-built table of users.

    A DataFrame is created from a dict (one column per user), re-indexed by a
    descriptive column, queried by column, row and cell, transposed, extended
    with a derived column and an extra row, described and finally re-indexed.
    Every step is printed to stdout; the function returns nothing.
    """
    # Each dict key becomes a column; each list holds [Num, Sex, Sports]
    my_data = {
        'User-1': [10, 'M', 'Cricketer'],
        'User-2': [30, 'F', 'BasketBall'],
        'User-3': [15, 'F', 'Table Tennis'],
        'User-4': [100, 'M', 'History'],
        'User-5': [50, 'F', 'Soccer'],
    }

    users_df = pd.DataFrame(my_data)
    print('\n[*] Current view of the dataframe \n {}'.format(users_df))
    print('\n[*] Here are your indexes \n {}'.format(users_df.index))
    print('\n[*] Here are your columns \n {}'.format(users_df.columns))
    print('\n[*] Here are your values \n {}'.format(users_df.values))

    # Add a new column holding a descriptive label for every row
    users_df['new_index'] = ['Num', 'Sex', 'Sports']
    print('[*] The new dataframe \n {}'.format(users_df))

    # Promote that column to be the row index, modifying users_df in place
    users_df.set_index('new_index', inplace=True)

    # Name the columns axis; the row index already carries the name 'new_index'
    users_df.columns.name = 'New Index'
    print('\n[*] users_df with new index column \n{}'.format(users_df))

    # Access a single column
    print('\n[*] Print information on User-2 \n {}'.format(users_df['User-2']))

    # Access multiple columns by passing a list of labels
    print('\n[*] Print information on User-2 and User-5 \n {}'.format(users_df[['User-2', 'User-5']]))

    # Access an entire row by its index label. The label previously repeated
    # the "User-2 and User-5" text although the 'Num' row is what is printed.
    print('\n[*] Print information on the Num row \n {}'.format(users_df.loc['Num']))

    # Show the type of object returned for a single row
    print('\n[*] Type for the return column \n {}'.format(type(users_df.loc['Num'])))

    # Access one cell with '.at'; the order is (row label, column label)
    print('\n[*] Print information on User 2 sports  \n {}'.format(users_df.at['Sports', 'User-2']))

    # Transpose: users become rows, and Num/Sex/Sports become columns
    users_transpose_df = users_df.T
    print('\n[*] Here we transpose the dataframe. We made the columns into rows and the rows into columns \n {}'.format(users_transpose_df))

    # Boolean mask of everyone whose Num is less than 50
    print('\n[*] Here is everyone whose age is less than 50 \n {}'.format((users_transpose_df.Num < 50)))

    # Store that mask as a derived column
    users_transpose_df['derived_num_lt_50'] = users_transpose_df.Num < 50

    print('\n[*] Here is your new dataframe with its derived column \n {}'.format(users_transpose_df))

    # Add a row. The derived value is the boolean False (70 is not < 50);
    # using the integer 0 would break the column's boolean dtype.
    users_transpose_df.loc['User-6'] = [70, 'M', 'Volleyball', False]
    print('\n[*] New row added for User-6\n {}'.format(users_transpose_df))

    # Summary statistics for the whole dataframe
    print('\n[*] Describing the dataframe \n {}'.format(users_transpose_df.describe()))

    # Summary statistics for a single column, here Num
    print('\n[*] Describing the Num column \n {}'.format(users_transpose_df.Num.describe()))

    # reset_index() returns a copy whose index is replaced by incrementing numbers
    print('\n[*] Index reset. Note the new index to the left with the incrementing numbers \n {}'.format(users_transpose_df.reset_index()))
    

if __name__ == '__main__':
    main()

Posts in this series:
Beginning Numpy
Beginning Pandas
Pandas String Operations, etc.

Beginning Pandas

In this second part of my learning ...

* Pandas works well with text-based data and is extremely powerful.
* Pandas allows us to look at data from both a macro and micro perspective.
* One of its main features is the DataFrame. DataFrames can be considered similar to Excel spreadsheets.
* It also has the capability for Series, which are array like.
* Pandas index does not have to start at 0
* Pandas index does not have to be ordered
* Pandas index does not have to be a number. Can be a list of strings
* Pandas index are very flexible
* A Series provides both an index and the values associated with it
* If an index is not specifically defined, Pandas will create an incrementing index
* Can even create Pandas series from Python dictionaries
* Pandas has the capability to do both position-based and label-based lookup. The two should not be confused
* To use label based lookup, use ".loc"
* For position based lookup use the ".iloc"
* Can pass a list of index values to both the ".loc" and ".iloc"
* Older versions of Pandas also offered ".ix", which tried to look up by label first and, if that did not work, by position. It was deprecated in Pandas 0.20 and removed in Pandas 1.0, so use ".loc" or ".iloc" instead.
* Pandas is built on Numpy

#!/usr/bin/env python3

import numpy as np
import pandas as pd


'''
side_by_side function from Wes McKinney, author of Pandas
if using python3, see this link for error you may get relating to the adjoin function
https://stackoverflow.com/questions/38156965/pandas-cannot-import-name-adjoin
'''
def side_by_side(*objs, **kwds):
    """Print the ``repr`` of each object in ``objs`` next to one another.

    Each object's repr is split into lines and the resulting columns are laid
    out horizontally with pandas' ``adjoin`` helper. The optional ``space``
    keyword sets the gap between columns and defaults to 4. The joined text
    is printed; nothing is returned.
    """
    # Imported here, as in the original, so the module loads without touching
    # this pandas-internal path until the helper is actually called.
    from pandas.io.formats.printing import adjoin

    gap = kwds.get('space', 4)

    columns = []
    for obj in objs:
        columns.append(repr(obj).split('\n'))

    print(adjoin(gap, *columns))



def main():
    """Demonstrate creating pandas Series and looking values up by label and position.

    Three Series are built (from random numbers, a list and a dict), printed
    individually and side by side, and then queried with plain label
    indexing, ``.loc`` and ``.iloc``. Everything is written to stdout; the
    function returns nothing.
    """
    print('[*] You are running pandas version {}'.format(pd.__version__))
    print('[*] You are running numpy version {}'.format(np.__version__))

    # Series of 6 random integers generated by numpy's randint(1, 100)
    rand_series = pd.Series(
        np.random.randint(1, 100, 6),
        index=['rand0', 'rand1', 'rand2', 'rand3', 'rand4', 'rand5'],
        name='Rand Series',
    )
    rand_series.index.name = 'Rand Value'
    print('\n[*] \n{}'.format(rand_series))

    # A series can also be created from a python list
    list_series = pd.Series(
        [98, 80, -50, 70, -10, 15],
        index=['numA', 'numB', 'numC', 'numD', 'numE', 'numF'],
        name='list_series',
    )
    list_series.index.name = 'List Value'
    print('\n[*] Current values in list_series \n{}'.format(list_series))

    # A series can be created from a dict, whose keys become the index labels.
    # Passing integer-keyed data together with a separate index of 'dict0'..'dict5'
    # labels turned every value into NaN, so the labels are used as dict keys.
    dict_series = pd.Series(
        {'dict0': 1, 'dict1': 2, 'dict2': 4, 'dict3': 3, 'dict4': 9, 'dict5': 20},
        name='Dict Series',
    )
    dict_series.index.name = 'Dict Value'
    print('\n[*] Current values in the dict_series \n {}'.format(dict_series))
    print()

    # side_by_side() prints the table itself and returns None, so it is called
    # on its own rather than embedded in a format string (which printed "None").
    print('[*] The 3 series side-by-side')
    side_by_side(rand_series, list_series, dict_series)
    print()

    # Label-based lookup of a single value using its index label
    print('\n[*] The value of numB is:{}\n '.format(list_series['numB']))

    # Label-based lookup of several values at once with .loc and a list of labels
    print('\n[*] The values of rand1 and rand5 are: \n{} '.format(rand_series.loc[['rand1', 'rand5']]))

    # Position-based lookup with .iloc; numE is the fifth element, i.e. position 4
    print('\n[*] The value of numE is:{}\n '.format(list_series.iloc[4]))

    # The old .ix indexer (labels first, then positions) was removed in pandas 1.0,
    # so the label lookup is done with .loc instead.
    print('\n[*] The values of rand2 and rand3 are:\n{} '.format(rand_series.loc[['rand2', 'rand3']]))
    


if __name__ == '__main__':
    main()

References
NumPy Reference
pandas: powerful Python data analysis toolkit
Pandas Series

Posts in this series:
Beginning Numpy
Beginning Pandas
Pandas String Operations, etc.