def load_and_preprocess_dataset(path: str) -> pd.DataFrame:
    """Load an activities CSV export and return a cleaned DataFrame.

    The following steps are applied in order:

    1. Read the CSV at ``path`` as UTF-8.
    2. Drop a fixed list of unneeded columns (only those actually present).
    3. Parse ``Datum`` as datetime and insert its time-of-day component as
       a new ``Start Time`` column at position 2.
    4. Replace the ``'--'`` placeholder with NaN.
    5. Relabel the activity types ``'Indoor-Laufen'`` and
       ``'Laufbandtraining'`` as ``'Laufen'``.
    6. Convert ``Zeit``, ``Ø Pace`` and ``Beste Pace`` (when present) with
       ``from_text_to_numeric(..., output='m')``.
    7. Convert every column that can be parsed as float (after removing
       commas) to float; all other columns are left untouched.
    8. Rename columns: ``'Ø '`` becomes ``'Avg_'`` and spaces become ``'_'``.
    9. Replace ``0.0`` in ``Avg_Herzfrequenz`` (when present) with NaN.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The preprocessed DataFrame. The ``Datum`` and ``Aktivitätstyp``
        columns are required in the input file.
    """
    # Load the original dataset
    df = pd.read_csv(path, encoding='utf8')

    # Drop all irrelevant columns; the export does not always contain each one
    potential_drops = [
        'Durchschnittliches vertikales Verhältnis', 'Ø vertikale Bewegung', 'Ø Bodenkontaktzeit',
        'Training Stress Score®', 'Ø Leistung', 'Max. Leistung', 'Grit', 'Flow', 'Ø Swolf',
        'Ø Schlagrate', 'Tauchzeit', 'Oberflächenpause', 'Dekompression', 'Favorit', 'Titel',
        'Zeit in Bewegung', 'Verstrichene Zeit', 'Beste Rundenzeit', 'Wiederholungen insgesamt',
        'Sätze insgesamt',
    ]
    df.drop(columns=[c for c in potential_drops if c in df.columns], inplace=True)

    # Parse the date column and expose the time of day as its own column
    df['Datum'] = pd.to_datetime(df['Datum'])
    df.insert(loc=2, column='Start Time', value=df['Datum'].dt.time)

    # Replace all '--' placeholder values with NaN
    df.replace('--', np.nan, inplace=True)

    # Rename different types of running activities to 'Laufen'
    df.loc[df['Aktivitätstyp'].isin(['Indoor-Laufen', 'Laufbandtraining']), 'Aktivitätstyp'] = 'Laufen'

    # Transform the time-like columns to numeric values
    # (presumably minutes, given output='m' -- confirm against from_text_to_numeric)
    for col in ['Zeit', 'Ø Pace', 'Beste Pace']:
        if col in df.columns:
            df[col] = df[col].astype(str).apply(lambda t: from_text_to_numeric(time=t, output='m'))

    # Best-effort numeric conversion: a column is converted only if *every*
    # value parses as float once commas are removed; otherwise it is kept.
    for col in df.columns:
        try:
            converted = df[col].astype(str).str.replace(',', '', regex=False).astype(float)
        except (ValueError, TypeError):
            # Not a numeric column (text, dates, times, ...): leave unchanged.
            continue
        df[col] = converted

    # Rename the columns to replace the Ø sign with Avg and spaces with underscores
    df.columns = [c.replace('Ø ', 'Avg_').replace(' ', '_') for c in df.columns]

    # Replace 0.0 values in the Avg_Herzfrequenz column with NaN as this is not possible
    if 'Avg_Herzfrequenz' in df.columns:
        df.loc[df['Avg_Herzfrequenz'] == 0.0, 'Avg_Herzfrequenz'] = np.nan

    # Return the DataFrame
    return df
# Run the preprocessing on the 2022 activities CSV, then preview the result
# by stacking its first five and last five rows.
data = load_and_preprocess_dataset('Activities_2022_Cleaned.csv')
pd.concat([data.head(5), data.tail(5)])