Introduction to pandas in Python
by K. Yue
1. Overview of the pandas module
Examples:
pandas_series_1.py: Please download, run and annotate the program.
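Note that numpy.nan is an ordinary Python float holding the IEEE 754 "not a number" value. A Series that contains it is stored with the numpy data type numpy.float64 (a 64-bit floating-point number), so the integer values in series_4 are displayed as floats.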
import pandas as pd
import numpy as np
# Build a Series straight from a plain Python list. No index is supplied,
# so pandas labels the elements with the default integer index 0..4.
data = [10, 20, 30, 40, 50]
series_1 = pd.Series(data=data)
print(f"series_1: Series from a list: {data}")
print(series_1)
print()
# An empty series. The dtype is stated explicitly: a bare pd.Series() emits
# a warning on pandas 1.x (where it defaults to float64) and silently
# defaults to object on pandas 2.x, so naming the dtype keeps the result the
# same on every version.
series_2 = pd.Series(dtype=object)
print("series_2: empty series: []")
print(series_2)
print()
# A series with custom labels/indices: the i-th entry of test_index becomes
# the label of the i-th entry of test_data.
test_data = [90, 77, 90]
test_index = ["test_1", "test_2", "test_3"]
series_3 = pd.Series(test_data, index=test_index)
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("series_3: explicitly labeled series.")
print(f"element values: {test_data}")
print(f"custom index: {test_index}")
print(series_3)
print()
# A series with custom labels/indices and a 'not a number' value (np.nan).
# Because of the missing value, pandas stores the whole series as float64,
# so 90 and 77 are shown as 90.0 and 77.0.
test_data = [90, 77, 90, np.nan]
test_index = ["test_1", "test_2", "test_3", "test_4"]
series_4 = pd.Series(test_data, index=test_index)
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("series_4: explicitly labeled series.")
print(f"element values: {test_data}")
print(f"custom index: {test_index}")
print(series_4)
print()
# Example methods of Series.
# Every label ends with a newline so the multi-line Series output starts on
# its own line and its rows stay aligned.
print(f"series_4.head():\n{series_4.head()}")
print()
print(f"series_4.describe():\n{series_4.describe()}")
print()
print(f"series_4.value_counts():\n{series_4.value_counts()}")
print()
# sort_values() returns a new, sorted Series; series_4 is printed again
# right after to show that the original is left untouched.
print(f"series_4.sort_values():\n{series_4.sort_values()}")
print()
print(f"series_4:\n{series_4}")
print()
# dropna() likewise returns a new Series without the NaN entry and does not
# modify series_4, as the final print demonstrates.
print(f"series_4.dropna():\n{series_4.dropna()}")
print()
print(f"series_4:\n{series_4}")
pandas_df_1.py: Please download, run and annotate the program.
import pandas as pd
import numpy as np
# --- df1: a DataFrame built from a dictionary of equal-length lists. ---
# Each key becomes a column label; no index is passed, so the rows get the
# default integer index.
student_columns = {
    "id": ["1011231", "2201031", "0882541", "3452211"],
    "Name": ["John Sawyer", "Alice Johnson", "Lee Tran", "Bobby Jones"],
    "major": ["CS", "CIS", "CS", "MATH"],
}
df1 = pd.DataFrame(data=student_columns)
print(f"df1:\n{df1}")
print()

# --- df2: numeric columns with the years used as explicit row labels. ---
year_labels = [2021, 2022, 2023, 2024, 2025, 2026]
columns_by_label = {
    "CIS": [110, 151, 138, 145, 120, 114],
    "CS": [132, 153, 128, 160, 130, 126],
    "ITEC": [80, 99, 110, 115, 121, 115],
}
df2 = pd.DataFrame(data=columns_by_label, index=year_labels)
print(f"df2:\n{df2}")
print()

# Sums: axis=0 adds down each column (one total per column label), while
# axis=1 adds across each row (one total per year).
print(f"df2.sum(axis=0):\n{df2.sum(axis=0)}")
print()
print(f"df2.sum(axis=1):\n{df2.sum(axis=1)}")
print()

# Summary statistics, computed column by column.
print("Descriptive statistics for all columns in df2 (df2.describe()):")
print(df2.describe())
print()

# First and last three rows.
print(f"df2.head(3):\n{df2.head(3)}")
print()
print(f"df2.tail(3):\n{df2.tail(3)}")
print()

# Label-based row lookup: .loc takes the index label (the year), not a
# positional offset, and returns that row as a Series.
print(f"df2.loc[2023]:\n{df2.loc[2023]}")
print()
print(f"df2.loc[2025]:\n{df2.loc[2025]}")
print()