In [1]:
import polars as pl
import random
Read in CSV data¶
In [2]:
# Policy-level risk features, one row per policy keyed by IDpol.
# NOTE(review): looks like the French MTPL (freMTPL2) dataset — confirm provenance.
policies = pl.read_csv('./raw-data/policies.csv')
policies
Out[ ]:
shape: (678_013, 10)
| IDpol | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Area | Density | Region |
|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | i64 | i64 | i64 | str | str | str | i64 | str |
| 1 | 5 | 0 | 55 | 50 | "B12" | "Regular" | "D" | 1217 | "Rhone-Alpes" |
| 3 | 5 | 0 | 55 | 50 | "B12" | "Regular" | "D" | 1217 | "Rhone-Alpes" |
| 5 | 6 | 2 | 52 | 50 | "B12" | "Diesel" | "B" | 54 | "Picardie" |
| 10 | 7 | 0 | 46 | 50 | "B12" | "Diesel" | "B" | 76 | "Aquitaine" |
| 11 | 7 | 0 | 46 | 50 | "B12" | "Diesel" | "B" | 76 | "Aquitaine" |
| … | … | … | … | … | … | … | … | … | … |
| 6114326 | 4 | 0 | 54 | 50 | "B12" | "Regular" | "E" | 3317 | "Provence-Alpes-Cotes-D'Azur" |
| 6114327 | 4 | 0 | 41 | 95 | "B12" | "Regular" | "E" | 9850 | "Ile-de-France" |
| 6114328 | 6 | 2 | 45 | 50 | "B12" | "Diesel" | "D" | 1323 | "Rhone-Alpes" |
| 6114329 | 4 | 0 | 60 | 50 | "B12" | "Regular" | "B" | 95 | "Bourgogne" |
| 6114330 | 7 | 6 | 29 | 54 | "B12" | "Diesel" | "B" | 65 | "Aquitaine" |
In [3]:
# Exposure per policy (IDpol -> Exposure); values in [0.04, 1.0] above,
# presumably the fraction of a year the policy was in force — confirm with source.
exposure = pl.read_csv('./raw-data/exposure.csv')
exposure
Out[ ]:
shape: (678_013, 2)
| IDpol | Exposure |
|---|---|
| i64 | f64 |
| 2124053 | 0.53 |
| 1049168 | 0.1 |
| 134313 | 1.0 |
| 1145209 | 0.06 |
| 2281532 | 0.5 |
| … | … |
| 4134506 | 1.0 |
| 1037983 | 0.04 |
| 3197389 | 1.0 |
| 25934 | 1.0 |
| 72150 | 1.0 |
In [4]:
# One row per individual claim: IDpol repeats for multi-claim policies
# (e.g. 6113834 appears twice in the preview) and some rows carry a 0.0 amount.
claims_amounts = pl.read_csv('./raw-data/claims_amounts.csv')
claims_amounts
Out[ ]:
shape: (35_560, 2)
| IDpol | ClaimAmount |
|---|---|
| i64 | f64 |
| 1 | 0.0 |
| 3 | 0.0 |
| 5 | 0.0 |
| 10 | 0.0 |
| 11 | 0.0 |
| … | … |
| 6113793 | 1769.88 |
| 6113817 | 1288.28 |
| 6113834 | 10290.0 |
| 6113834 | 1940.4 |
| 6113971 | 1452.0 |
Policies¶
In [5]:
# Seed the RNG so the random group assignment is reproducible on Restart & Run All
random.seed(42)

# Assign each policy a random group label "1"-"5" (stored as a string column)
policies = policies.with_columns([
    pl.Series("Group", [random.randint(1, 5) for _ in range(policies.height)]).cast(pl.String)
])
# Convert integer feature columns to floats for modelling.
# IDpol is excluded: it is the join key and must stay Int64 (the original cast it
# to Float64 here and immediately back to Int64 — a redundant round-trip).
policies = policies.with_columns([
    pl.col(col).cast(pl.Float64)
    for col, dtype in zip(policies.columns, policies.dtypes)
    if dtype == pl.Int64 and col != 'IDpol'
])
Claims¶
In [6]:
# Derive a claim count per policy: each input row is one claim, so counting
# rows that share an IDpol gives the number of claims on that policy.
claim_counts = (
    claims_amounts
    .with_columns(pl.lit(1).alias('ClaimCount'))
    .group_by('IDpol')
    .agg(pl.col('ClaimCount').sum())
)
Frequency Data¶
In [7]:
# Assemble the frequency modelling table: every policy with its exposure and
# claim count. Policies with no claims are absent from claim_counts, so the
# left join leaves nulls there — backfill them with 0.
frequency_set = (
    policies
    .join(exposure, on='IDpol', how='inner')
    .join(claim_counts, on='IDpol', how='left')
    .with_columns(pl.col('ClaimCount').fill_null(0).alias('ClaimCount'))
)
# Persist as parquet for downstream modelling
frequency_set.write_parquet('./processed-data/frequency_set.parquet')
Severity Data¶
In [8]:
# Build the severity modelling table:
# - semi join on claim_counts restricts to policies that have at least one claim
#   without adding columns (replaces the original inner-join-then-drop('ClaimCount');
#   equivalent because claim_counts is unique per IDpol, being a group_by result)
# - left join on claims_amounts yields one row per individual claim
# - missing ClaimAmount is filled with 0, then capped at 50k to limit the
#   influence of extreme losses (clip replaces the hand-rolled when/then/otherwise)
severity_set = (
    policies
    .join(claim_counts, on='IDpol', how='semi')
    .join(claims_amounts, on='IDpol', how='left')
    .with_columns(ClaimAmount=pl.col('ClaimAmount').fill_null(0).clip(upper_bound=50_000))
)
severity_set.write_parquet('./processed-data/severity_set.parquet')