In [1]:
import polars as pl
import random
Read in CSV data
In [2]:
policies = pl.read_csv('./raw-data/policies.csv')
policies
Out[2]:
shape: (678_013, 10)
IDpol | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Area | Density | Region |
---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | i64 | i64 | str | str | str | i64 | str |
1 | 5 | 0 | 55 | 50 | "B12" | "Regular" | "D" | 1217 | "Rhone-Alpes" |
3 | 5 | 0 | 55 | 50 | "B12" | "Regular" | "D" | 1217 | "Rhone-Alpes" |
5 | 6 | 2 | 52 | 50 | "B12" | "Diesel" | "B" | 54 | "Picardie" |
10 | 7 | 0 | 46 | 50 | "B12" | "Diesel" | "B" | 76 | "Aquitaine" |
11 | 7 | 0 | 46 | 50 | "B12" | "Diesel" | "B" | 76 | "Aquitaine" |
… | … | … | … | … | … | … | … | … | … |
6114326 | 4 | 0 | 54 | 50 | "B12" | "Regular" | "E" | 3317 | "Provence-Alpes-Cotes-D'Azur" |
6114327 | 4 | 0 | 41 | 95 | "B12" | "Regular" | "E" | 9850 | "Ile-de-France" |
6114328 | 6 | 2 | 45 | 50 | "B12" | "Diesel" | "D" | 1323 | "Rhone-Alpes" |
6114329 | 4 | 0 | 60 | 50 | "B12" | "Regular" | "B" | 95 | "Bourgogne" |
6114330 | 7 | 6 | 29 | 54 | "B12" | "Diesel" | "B" | 65 | "Aquitaine" |
In [3]:
exposure = pl.read_csv('./raw-data/exposure.csv')
exposure
Out[3]:
shape: (678_013, 2)
IDpol | Exposure |
---|---|
i64 | f64 |
2124053 | 0.53 |
1049168 | 0.1 |
134313 | 1.0 |
1145209 | 0.06 |
2281532 | 0.5 |
… | … |
4134506 | 1.0 |
1037983 | 0.04 |
3197389 | 1.0 |
25934 | 1.0 |
72150 | 1.0 |
In [4]:
claims_amounts = pl.read_csv('./raw-data/claims_amounts.csv')
claims_amounts
Out[4]:
shape: (35_560, 2)
IDpol | ClaimAmount |
---|---|
i64 | f64 |
1 | 0.0 |
3 | 0.0 |
5 | 0.0 |
10 | 0.0 |
11 | 0.0 |
… | … |
6113793 | 1769.88 |
6113817 | 1288.28 |
6113834 | 10290.0 |
6113834 | 1940.4 |
6113971 | 1452.0 |
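The three raw tables are read eagerly here. For files too large to hold comfortably in memory, polars also offers a lazy reader; a minimal sketch of that alternative (not used in this notebook):

In [ ]:
# Lazy variant: scan_csv builds a query plan instead of reading the file
# immediately, and collect() materialises the DataFrame
policies = pl.scan_csv('./raw-data/policies.csv').collect()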
Policies
In [5]:
# Assign each policy a random group label between 1 and 5, stored as a string
# (the generator is unseeded, so the assignment varies between runs)
policies = policies.with_columns(
    pl.Series("Group", [random.randint(1, 5) for _ in range(policies.height)]).cast(pl.String)
)
# Cast integer columns to floats for modelling
policies = policies.with_columns(pl.col(pl.Int64).cast(pl.Float64))
# Cast the ID column back to an integer type
policies = policies.with_columns(IDpol=pl.col('IDpol').cast(pl.Int64))
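A quick sanity check (illustrative, not part of the original pipeline) confirms the casts took effect and that the random groups are roughly balanced:

In [ ]:
# Inspect the column dtypes and the distribution of the random group labels
print(policies.schema)
print(policies['Group'].value_counts())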
Claims
In [6]:
# Count the number of claim rows per policy ID
claim_counts = (
    claims_amounts
    .group_by('IDpol')
    .agg(pl.len().alias('ClaimCount'))
)
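Assuming each row of claims_amounts represents a single claim, the counts should sum back to the number of claim rows; a minimal check:

In [ ]:
# Total ClaimCount across policies should equal the number of claim rows
assert claim_counts['ClaimCount'].sum() == claims_amounts.height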
Frequency Data
In [7]:
# Join the exposure and claim count tables onto the policies,
# filling ClaimCount with 0 for policies that made no claims
frequency_set = (
    policies
    .join(exposure, on='IDpol', how='inner')
    .join(claim_counts, on='IDpol', how='left')
    .with_columns(ClaimCount=pl.col('ClaimCount').fill_null(0))
)
# Write data as parquet
frequency_set.write_parquet('./processed-data/frequency_set.parquet')
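As an illustrative summary (not part of the pipeline), the portfolio-level empirical claim frequency is total claims divided by total exposure in policy-years:

In [ ]:
# Empirical claim frequency = total claims / total exposure (policy-years)
frequency_set.select(
    (pl.col('ClaimCount').sum() / pl.col('Exposure').sum()).alias('EmpiricalFrequency')
)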
Severity Data
In [8]:
# Left-join claim amounts onto policies, then inner-join claim_counts so that
# only policies with at least one claim remain.
# Fill missing ClaimAmount with 0 and cap ClaimAmount at 50k.
severity_set = (
    policies
    .join(claims_amounts, on='IDpol', how='left')
    .join(claim_counts, on='IDpol', how='inner')
    .with_columns(ClaimAmount=pl.col('ClaimAmount').fill_null(0))
    .drop('ClaimCount')
    .with_columns(ClaimAmount=pl.col('ClaimAmount').clip(upper_bound=50_000))
)
severity_set.write_parquet('./processed-data/severity_set.parquet')
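A final illustrative check: the 50k cap should hold after the clip, and the mean claim amount gives a first feel for the severity level:

In [ ]:
# Verify the cap and inspect the average claim amount per claim row
assert severity_set['ClaimAmount'].max() <= 50_000
severity_set.select(pl.col('ClaimAmount').mean().alias('MeanClaimAmount'))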