In [1]:
import polars as pl
import random

Read in CSV data

In [2]:
policies = pl.read_csv('./raw-data/policies.csv')
policies
Out[2]:
shape: (678_013, 10)
IDpol    VehPower  VehAge  DrivAge  BonusMalus  VehBrand  VehGas     Area  Density  Region
i64      i64       i64     i64      i64         str       str        str   i64      str
1        5         0       55       50          "B12"     "Regular"  "D"   1217     "Rhone-Alpes"
3        5         0       55       50          "B12"     "Regular"  "D"   1217     "Rhone-Alpes"
5        6         2       52       50          "B12"     "Diesel"   "B"   54       "Picardie"
10       7         0       46       50          "B12"     "Diesel"   "B"   76       "Aquitaine"
11       7         0       46       50          "B12"     "Diesel"   "B"   76       "Aquitaine"
…        …         …       …        …           …         …          …     …        …
6114326  4         0       54       50          "B12"     "Regular"  "E"   3317     "Provence-Alpes-Cotes-D'Azur"
6114327  4         0       41       95          "B12"     "Regular"  "E"   9850     "Ile-de-France"
6114328  6         2       45       50          "B12"     "Diesel"   "D"   1323     "Rhone-Alpes"
6114329  4         0       60       50          "B12"     "Regular"  "B"   95       "Bourgogne"
6114330  7         6       29       54          "B12"     "Diesel"   "B"   65       "Aquitaine"
In [3]:
exposure = pl.read_csv('./raw-data/exposure.csv')
exposure
Out[3]:
shape: (678_013, 2)
IDpol    Exposure
i64      f64
2124053  0.53
1049168  0.1
134313   1.0
1145209  0.06
2281532  0.5
…        …
4134506  1.0
1037983  0.04
3197389  1.0
25934    1.0
72150    1.0
In [4]:
claims_amounts = pl.read_csv('./raw-data/claims_amounts.csv')
claims_amounts
Out[4]:
shape: (35_560, 2)
IDpol    ClaimAmount
i64      f64
1        0.0
3        0.0
5        0.0
10       0.0
11       0.0
…        …
6113793  1769.88
6113817  1288.28
6113834  10290.0
6113834  1940.4
6113971  1452.0
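
A single policy can carry several claims (IDpol 6113834 appears twice above), which is why claim rows are aggregated per policy below. One quick way to inspect such a policy:

claims_amounts.filter(pl.col('IDpol') == 6113834)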

Policies

In [5]:
# Assign each policy a random group between 1 and 5 (stored as a string label)
policies = policies.with_columns([
    pl.Series("Group", [random.randint(1, 5) for _ in range(policies.height)]).cast(pl.String)
])

# Cast integer columns to floats for modelling
policies = policies.with_columns([
    pl.col(col).cast(pl.Float64) for col, dtype in zip(policies.columns, policies.dtypes) if dtype == pl.Int64
])

# IDpol is an identifier rather than a feature, so cast it back to integer
policies = policies.with_columns(IDpol=pl.col('IDpol').cast(pl.Int64))
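
Because Group comes from Python's random module, the assignment changes on every run. A minimal sketch for making it reproducible is to seed the generator before the assignment above (the seed value here is arbitrary, chosen for illustration); the schema print is just a sanity check that the casts behaved as expected:

random.seed(42)  # arbitrary seed; any fixed value makes the groups reproducible

# IDpol should be Int64, the other numeric columns Float64, Group String
print(policies.schema)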

Claims

In [6]:
# Each row of claims_amounts is one claim, so count rows per policy ID
claim_counts = (
    claims_amounts
    .with_columns(ClaimCount=pl.lit(1))
    .group_by('IDpol')
    .agg(pl.sum('ClaimCount'))
)
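
Since each row of claims_amounts is one claim, the same counts can also be computed without the helper column. A sketch using pl.len(), which is available in recent Polars releases (the count dtype may differ from the sum-based version):

claim_counts_alt = (
    claims_amounts
    .group_by('IDpol')
    .agg(pl.len().alias('ClaimCount'))
)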

Frequency Data

In [7]:
# Join policies, exposure and claim_counts together.
# Policies with no claims get a null ClaimCount from the left join, so fill those with 0.
frequency_set = (
    policies
    .join(exposure, on='IDpol', how='inner')
    .join(claim_counts, on='IDpol', how='left')
    .with_columns(ClaimCount=pl.col('ClaimCount').fill_null(0))
)

# Write data as parquet
frequency_set.write_parquet('./processed-data/frequency_set.parquet')
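
As a rough sanity check on the frequency set, the portfolio-level claim frequency can be computed as total claims over total exposure; the column name below is illustrative:

overall_frequency = frequency_set.select(
    (pl.col('ClaimCount').sum() / pl.col('Exposure').sum()).alias('ClaimsPerExposureUnit')
)
print(overall_frequency)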

Severity Data

In [8]:
# Left join claims_amounts onto policies, then inner join claim_counts
# to keep only policies with at least one claim.
# Populate any missing ClaimAmount with 0.
# Cap ClaimAmount at 50k.
severity_set = (
    policies
    .join(claims_amounts, on='IDpol', how='left')
    .join(claim_counts, on='IDpol', how='inner')
    .with_columns(ClaimAmount=pl.col('ClaimAmount').fill_null(0))
    .drop('ClaimCount')  # only needed to restrict the rows, not as a feature
    .with_columns(
        ClaimAmount=pl.when(pl.col('ClaimAmount') > 50_000)
        .then(pl.lit(50_000))
        .otherwise(pl.col('ClaimAmount'))
    )
)

severity_set.write_parquet('./processed-data/severity_set.parquet')
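
A couple of quick checks on the severity set, assuming the joins behaved as intended: the cap should hold, and the left join should have produced roughly one row per claim record:

# No amount should exceed the 50k cap
assert severity_set['ClaimAmount'].max() <= 50_000

# Row counts: one row per claim for policies present in both datasets
print(severity_set.height, claims_amounts.height)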