In [1]:
import polars as pl
import random
Read in CSV data
In [2]:
policies = pl.read_csv('./raw-data/policies.csv')
policies
Out[2]:
shape: (678_013, 10)
IDpol | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Area | Density | Region |
---|---|---|---|---|---|---|---|---|---|
i64 | i64 | i64 | i64 | i64 | str | str | str | i64 | str |
1 | 5 | 0 | 55 | 50 | "B12" | "Regular" | "D" | 1217 | "Rhone-Alpes" |
3 | 5 | 0 | 55 | 50 | "B12" | "Regular" | "D" | 1217 | "Rhone-Alpes" |
5 | 6 | 2 | 52 | 50 | "B12" | "Diesel" | "B" | 54 | "Picardie" |
10 | 7 | 0 | 46 | 50 | "B12" | "Diesel" | "B" | 76 | "Aquitaine" |
11 | 7 | 0 | 46 | 50 | "B12" | "Diesel" | "B" | 76 | "Aquitaine" |
… | … | … | … | … | … | … | … | … | … |
6114326 | 4 | 0 | 54 | 50 | "B12" | "Regular" | "E" | 3317 | "Provence-Alpes-Cotes-D'Azur" |
6114327 | 4 | 0 | 41 | 95 | "B12" | "Regular" | "E" | 9850 | "Ile-de-France" |
6114328 | 6 | 2 | 45 | 50 | "B12" | "Diesel" | "D" | 1323 | "Rhone-Alpes" |
6114329 | 4 | 0 | 60 | 50 | "B12" | "Regular" | "B" | 95 | "Bourgogne" |
6114330 | 7 | 6 | 29 | 54 | "B12" | "Diesel" | "B" | 65 | "Aquitaine" |
In [3]:
exposure = pl.read_csv('./raw-data/exposure.csv')
exposure
Out[3]:
shape: (678_013, 2)
IDpol | Exposure |
---|---|
i64 | f64 |
2124053 | 0.53 |
1049168 | 0.1 |
134313 | 1.0 |
1145209 | 0.06 |
2281532 | 0.5 |
… | … |
4134506 | 1.0 |
1037983 | 0.04 |
3197389 | 1.0 |
25934 | 1.0 |
72150 | 1.0 |
In [4]:
claims_amounts = pl.read_csv('./raw-data/claims_amounts.csv')
claims_amounts
Out[4]:
shape: (35_560, 2)
IDpol | ClaimAmount |
---|---|
i64 | f64 |
1 | 0.0 |
3 | 0.0 |
5 | 0.0 |
10 | 0.0 |
11 | 0.0 |
… | … |
6113793 | 1769.88 |
6113817 | 1288.28 |
6113834 | 10290.0 |
6113834 | 1940.4 |
6113971 | 1452.0 |
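The three raw tables are read eagerly here. For files too large to hold comfortably in memory, polars also offers a lazy reader; a minimal sketch of that alternative (not used in this notebook):

In [ ]:
# Lazy variant: scan_csv builds a query plan instead of reading the file
# immediately, and collect() materialises the DataFrame
policies = pl.scan_csv('./raw-data/policies.csv').collect()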
Policies
In [5]:
# Assign each policy a random group label between 1 and 5, stored as a string
# (the generator is unseeded, so the assignment varies between runs)
policies = policies.with_columns(
    pl.Series("Group", [random.randint(1, 5) for _ in range(policies.height)]).cast(pl.String)
)
# Cast integer columns to floats for modelling
policies = policies.with_columns(pl.col(pl.Int64).cast(pl.Float64))
# Cast the ID column back to an integer type
policies = policies.with_columns(IDpol=pl.col('IDpol').cast(pl.Int64))
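A quick sanity check (illustrative, not part of the original pipeline) confirms the casts took effect and that the random groups are roughly balanced:

In [ ]:
# Inspect the column dtypes and the distribution of the random group labels
print(policies.schema)
print(policies['Group'].value_counts())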
Claims
In [6]:
# Count the number of claim rows per policy ID
claim_counts = (
    claims_amounts
    .group_by('IDpol')
    .agg(pl.len().alias('ClaimCount'))
)
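Assuming each row of claims_amounts represents a single claim, the counts should sum back to the number of claim rows; a minimal check:

In [ ]:
# Total ClaimCount across policies should equal the number of claim rows
assert claim_counts['ClaimCount'].sum() == claims_amounts.height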
Frequency Data
In [7]:
# Join the exposure and claim count tables onto the policies,
# filling ClaimCount with 0 for policies that made no claims
frequency_set = (
    policies
    .join(exposure, on='IDpol', how='inner')
    .join(claim_counts, on='IDpol', how='left')
    .with_columns(ClaimCount=pl.col('ClaimCount').fill_null(0))
)
# Write data as parquet
frequency_set.write_parquet('./processed-data/frequency_set.parquet')
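As an illustrative summary (not part of the pipeline), the portfolio-level empirical claim frequency is total claims divided by total exposure in policy-years:

In [ ]:
# Empirical claim frequency = total claims / total exposure (policy-years)
frequency_set.select(
    (pl.col('ClaimCount').sum() / pl.col('Exposure').sum()).alias('EmpiricalFrequency')
)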
Severity Data
In [8]:
# Left-join claim amounts onto policies, then inner-join claim_counts so that
# only policies with at least one claim remain.
# Fill missing ClaimAmount with 0 and cap ClaimAmount at 50k.
severity_set = (
    policies
    .join(claims_amounts, on='IDpol', how='left')
    .join(claim_counts, on='IDpol', how='inner')
    .with_columns(ClaimAmount=pl.col('ClaimAmount').fill_null(0))
    .drop('ClaimCount')
    .with_columns(ClaimAmount=pl.col('ClaimAmount').clip(upper_bound=50_000))
)
severity_set.write_parquet('./processed-data/severity_set.parquet')
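A final illustrative check: the 50k cap should hold after the clip, and the mean claim amount gives a first feel for the severity level:

In [ ]:
# Verify the cap and inspect the average claim amount per claim row
assert severity_set['ClaimAmount'].max() <= 50_000
severity_set.select(pl.col('ClaimAmount').mean().alias('MeanClaimAmount'))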