1.6. Python Functions#
This week, we will need to create our own Python functions as part of running a permutation test.
Here we will review what function are and how we can create our own using Python code.
This is a kind of Python tangent to our main stats objective for the week.
1.6.1. Set up Python libraries#
# Set-up Python libraries - you need to run this but you don't need to change it
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import seaborn as sns
sns.set_theme(style='white')
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
warnings.simplefilter('ignore', category=FutureWarning)
1.6.2. Import the data#
We need some data to work with. Let’s use the good old Oxford Weather dataset.
weather = pd.read_csv("https://raw.githubusercontent.com/jillxoreilly/StatsCourseBook_2024/main/data/OxfordWeather.csv")
display(weather)
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[2], line 1
----> 1 weather = pd.read_csv("https://raw.githubusercontent.com/jillxoreilly/StatsCourseBook_2024/main/data/OxfordWeather.csv")
2 display(weather)
File /opt/anaconda3/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
944 dtype_backend=dtype_backend,
945 )
946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)
File /opt/anaconda3/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)
608 _validate_names(kwds.get("names", None))
610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
613 if chunksize or iterator:
614 return parser
File /opt/anaconda3/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
1445 self.options["has_index_names"] = kwds["has_index_names"]
1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)
File /opt/anaconda3/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1705, in TextFileReader._make_engine(self, f, engine)
1703 if "b" not in mode:
1704 mode += "b"
-> 1705 self.handles = get_handle(
1706 f,
1707 mode,
1708 encoding=self.options.get("encoding", None),
1709 compression=self.options.get("compression", None),
1710 memory_map=self.options.get("memory_map", False),
1711 is_text=is_text,
1712 errors=self.options.get("encoding_errors", "strict"),
1713 storage_options=self.options.get("storage_options", None),
1714 )
1715 assert self.handles is not None
1716 f = self.handles.handle
File /opt/anaconda3/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:718, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
715 codecs.lookup_error(errors)
717 # open URLs
--> 718 ioargs = _get_filepath_or_buffer(
719 path_or_buf,
720 encoding=encoding,
721 compression=compression,
722 mode=mode,
723 storage_options=storage_options,
724 )
726 handle = ioargs.filepath_or_buffer
727 handles: list[BaseBuffer]
File /opt/anaconda3/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:377, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
374 if content_encoding == "gzip":
375 # Override compression based on Content-Encoding header
376 compression = {"method": "gzip"}
--> 377 reader = BytesIO(req.read())
378 return IOArgs(
379 filepath_or_buffer=reader,
380 encoding=encoding,
(...)
383 mode=fsspec_mode,
384 )
386 if is_fsspec_url(filepath_or_buffer):
File /opt/anaconda3/anaconda3/lib/python3.11/http/client.py:489, in HTTPResponse.read(self, amt)
487 else:
488 try:
--> 489 s = self._safe_read(self.length)
490 except IncompleteRead:
491 self._close_conn()
File /opt/anaconda3/anaconda3/lib/python3.11/http/client.py:638, in HTTPResponse._safe_read(self, amt)
631 def _safe_read(self, amt):
632 """Read the number of bytes requested.
633
634 This function should be used when <amt> bytes "should" be present for
635 reading. If the bytes are truly not available (due to EOF), then the
636 IncompleteRead exception can be used to detect the problem.
637 """
--> 638 data = self.fp.read(amt)
639 if len(data) < amt:
640 raise IncompleteRead(data, amt-len(data))
File /opt/anaconda3/anaconda3/lib/python3.11/socket.py:706, in SocketIO.readinto(self, b)
704 while True:
705 try:
--> 706 return self._sock.recv_into(b)
707 except timeout:
708 self._timeout_occurred = True
File /opt/anaconda3/anaconda3/lib/python3.11/ssl.py:1315, in SSLSocket.recv_into(self, buffer, nbytes, flags)
1311 if flags != 0:
1312 raise ValueError(
1313 "non-zero flags not allowed in calls to recv_into() on %s" %
1314 self.__class__)
-> 1315 return self.read(nbytes, buffer)
1316 else:
1317 return super().recv_into(buffer, nbytes, flags)
File /opt/anaconda3/anaconda3/lib/python3.11/ssl.py:1167, in SSLSocket.read(self, len, buffer)
1165 try:
1166 if buffer is not None:
-> 1167 return self._sslobj.read(len, buffer)
1168 else:
1169 return self._sslobj.read(len)
KeyboardInterrupt:
1.6.3. What is a function?#
A function is a computer programme that takes in some information (an input), does something with it, and returns an output.
Functions were introduced in DataCamp and you could review this if helpful after reading this section.
We have been using Python functions for the last several weeks, mainy from the function libraries pandas
and seaborn
. For example the function df.mean()
gets the mean of each column in a dataframe.
Let’s make our own simple function to get the mean for a single column of a dataframe:
def myMean(x):
m=sum(x)/len(x)
return m
The input is
x
The output is
m
Inside the function, we calculate
m
fromx
You will notice if you ran the code block above that nothing seemed to happen. That’s because we just created the function (computer programme) but didn’t run it.
Now, having created the function, we can run or call it as follows:
myMean(weather.Rainfall_mm)
1.7869643833314295
What happened?
We called the function by saying
myMean()
We gave it an input (inside the brackets,
weather.Rainfall_mm
The function calculated the mean by adding up the values in th input column and dividing by the humber of values (length of the columns)
The function gave us an output (shown below the code box), of 1.79mm, which is the mean rainfall
Let’s just check using the built-in function that we are used to, df.mean()
weather.Rainfall_mm.mean()
1.7869643833312312
Yep, same answer.
Note#
You have to run the code block defining the function before you can call it, otherwise it won’t have been created and won’t exist!
1.6.4. Difference of means#
As another example, let’s define a function that takes in two inputs and finds the difference of their means:
def dMeans(x,y):
mx = sum(x)/len(x)
my = sum(y)/len(y)
diff = mx-my
return diff
Note that this function now has two inputs: x and y
The function does the following
calculate the mean for x as
mx
calculate the mean for y as
my
get the difference
mx-my
Let’s use it to calculate the difference in mean rainfall between November and May
# find the relevant rows and column in the dataframe and give them a name
nov = weather.query('Month == "Nov"').Rainfall_mm
may = weather.query('Month == "May"').Rainfall_mm
# run the function dMeans
dMeans(nov,may)
# note we could have done the same thing in a single line:
# dMeans(weather.query('Month == "Oct"').Rainfall_mm, weather.query('Month == "May"').Rainfall_mm)
# the only reason I didn't do this was that I think the version above is a bit easier to follow as a student
0.5064019851116548
Apparently it rains more in November than May, which is unsuprising; the mean daily rainfall is 0.51 mm greater in November.
Note that which input (nov or may) gets called x and y within the function is determined by the order that we write them within the function’s parentheses
In the function call we have:
def dMeans(x,y):
meaning that when we call the function, whatever is first in the brackets becomes x
and whatever is second becomes y
. So when we call
dMeans(nov,may)
nov becomes x and
may becomes y
The function returns mean(x) - mean(y) so this is rainfall in November-May; if the output is a positive number this means that there was more rain in November than May.
If we called dMeans(may,nov)
we would get rainfall in May-November - presumably a negative number, as the rainfall in November is higher.
1.6.5. Mean difference#
Finally, let’s define a function that takes in two matched pairs inputs and finds the mean difference (within pairs).
For example, say we want to know if the weather was warmer in 2001 than in 1901.
Instead of comparing the average temperature for all 365 days in 1901 to the average for all 365 days in 2001, we could compare each date in 1901 to the same date in 2001 - so we find the difference in temperature between Jan 1st 1901 and Jan 1st 2001, then the same for Jan 2nd etc.
Naturally this will only work if the data are actually matched and hence the two samples have the same \(n\).
def mDiff(x,y):
diff = x-y
meanDiff = sum(diff)/len(diff)
return meanDiff
Eagle-eyed readers may realise that these two functions, given the same data, give the same answer. However, we will see later in this chapter that once we start randomly shuffling data (as in a permutation test), the difference of means and mean difference behave quite differently.