import os
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio import features
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from pathlib import Path
from IPython.display import display
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.offline
plotly.offline.init_notebook_mode()
print('All libraries successfully imported!')
print(f'Scikit-learn: {sklearn.__version__}')
All libraries successfully imported! Scikit-learn: 0.24.2
computer_path = '/export/miro/ndeffense/LBRAT2104/'
grp_letter    = 'X'
lut_path  = f'{computer_path}data/LUT/'
# Directory for all work files
work_path = f'{computer_path}GROUP_{grp_letter}/WORK/'
in_situ_path = f'{work_path}IN_SITU/'
classif_path = f'{work_path}CLASSIF/'
am_path = f'{work_path}ACCURACY_METRICS/'
Path(am_path).mkdir(parents=True, exist_ok=True)
print(f'Accuracy Metrics path is set to : {am_path}')
Accuracy Metrics path is set to : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/ACCURACY_METRICS/
site = 'NAMUR'
year = '2020'
feat_nb = 2
no_data = 0
ws = 3          # Window size radius (filtering post classification)
reclassif_flag = True
filter_flag    = True
# Field used for classification
field_classif_code = 'grp_1_nb'
field_classif_name = 'grp_1'
# Field used for reclassification
field_reclassif_code = 'grp_A_nb'
field_reclassif_name = 'grp_A'
if not reclassif_flag and not filter_flag:
    print(f'Reclassification : {reclassif_flag}')
    print(f'Filter           : {filter_flag}')
    classif_tif   = f'{classif_path}{site}_{year}_classif_RF_feat_{feat_nb}_{field_classif_name}.tif'
    field_code = field_classif_code
    field_name = field_classif_name
elif reclassif_flag and not filter_flag:
    print(f'Reclassification : {reclassif_flag}')
    print(f'Filter           : {filter_flag}')
    classif_tif = f'{classif_path}{site}_{year}_classif_RF_feat_{feat_nb}_{field_classif_name}_reclassify_{field_reclassif_name}.tif'
    field_code = field_reclassif_code
    field_name = field_reclassif_name
elif reclassif_flag and filter_flag:
    print(f'Reclassification : {reclassif_flag}')
    print(f'Filter           : {filter_flag}')
    classif_tif = f'{classif_path}{site}_{year}_classif_RF_feat_{feat_nb}_{field_classif_name}_reclassify_{field_reclassif_name}_filter_ws_{ws}.tif'
    field_code = field_reclassif_code
    field_name = field_reclassif_name
    
else:
    print('No classfication file is available !')
    classif_tif = ""
# LUT to get class names
s4s_lut_csv = f'{lut_path}crop_dictionary_new.csv'
# In situ data used for validation
in_situ_val_shp = f'{in_situ_path}{site}_{year}_IN_SITU_ROI_VAL.shp'
in_situ_val_tif = f'{in_situ_path}{site}_{year}_IN_SITU_ROI_VAL_{field_name}.tif'
# Confusion matrix and Accuracy metrics files
cm_csv  = f'{am_path}{os.path.basename(classif_tif)}_CM.csv'
cm_html = f'{am_path}{os.path.basename(classif_tif)}_CM.html'
am_html = f'{am_path}{os.path.basename(classif_tif)}_AM.html'
# USED ONLY FOR VISU ON WEBSITE !
#cm_html = f'/export/miro/ndeffense/LBRAT2104/GIT/eo-toolbox/figures/{site}_{year}_CM.html'
#am_html = f'/export/miro/ndeffense/LBRAT2104/GIT/eo-toolbox/figures/{site}_{year}_AM.html'
print(f'Classification file used : \n {classif_tif}')
print(f'Validation polygons used : \n {in_situ_val_shp}')
Reclassification : True Filter : True Classification file used : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/CLASSIF/NAMUR_2020_classif_RF_feat_2_grp_1_reclassify_grp_A_filter_ws_3.tif Validation polygons used : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/IN_SITU/NAMUR_2020_IN_SITU_ROI_VAL.shp
# Open the shapefile with GeoPandas
in_situ_gdf = gpd.read_file(in_situ_val_shp)
# Open the raster file you want to use as a template for rasterize
print(f'Raster template file : {classif_tif}')
src = rasterio.open(classif_tif, "r")
# Update metadata
out_meta = src.meta
out_meta.update(nodata=no_data)
crs_shp = str(in_situ_gdf.crs).split(":",1)[1]
crs_tif = str(src.crs).split(":",1)[1]
print(f'The CRS of in situ data is    : {crs_shp}')
print(f'The CRS of raster template is : {crs_tif}')
if crs_shp == crs_tif:
    print("CRS are the same")
    print(f'Rasterize starts : {in_situ_val_shp}')
    # Burn the features into the raster and write it out
    dst = rasterio.open(in_situ_val_tif, 'w+', **out_meta)
    dst_arr = dst.read(1)
    # This is where we create a generator of geom, value pairs to use in rasterizing
    geom_col = in_situ_gdf.geometry
    code_col = in_situ_gdf[field_code].astype(int)
    shapes = ((geom,value) for geom, value in zip(geom_col, code_col))
    in_situ_arr = features.rasterize(shapes=shapes,
                                     fill=no_data,
                                     out=dst_arr,
                                     transform=dst.transform)
    
    dst.write_band(1, in_situ_arr)
    print(f'Rasterize is done : {in_situ_val_tif}')
    # Close rasterio objects
    src.close()
    dst.close()
else:
    print('CRS are different --> repoject in-situ data shapefile with "to_crs"')
Raster template file : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/CLASSIF/NAMUR_2020_classif_RF_feat_2_grp_1_reclassify_grp_A_filter_ws_3.tif The CRS of in situ data is : 32631 The CRS of raster template is : 32631 CRS are the same Rasterize starts : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/IN_SITU/NAMUR_2020_IN_SITU_ROI_VAL.shp Rasterize is done : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/IN_SITU/NAMUR_2020_IN_SITU_ROI_VAL_grp_A.tif
y_pred and y_true¶# Open in-situ used for validation
src = rasterio.open(in_situ_val_tif, "r")
val_arr = src.read(1)
src.close()
# Open classification map
src = rasterio.open(classif_tif, "r")
classif_arr = src.read(1)
src.close()
# Get the postion of validation pixels
idx = np.where(val_arr == no_data, 0, 1).astype(bool)
# Ground truth (correct) target values
y_true = val_arr[idx]
print(f'Reference data (truth) : {y_true}')
# Estimated targets as returned by a classifier.
y_pred = classif_arr[idx]
print(f'Classification data    : {y_pred}')
Reference data (truth) : [ 3 3 3 ... 111 111 111] Classification data : [ 3 3 3 ... 111 111 111]
Sometimes, some classes do not appear in the classification map, they are not predicted by the Random Forest.
This means that some classes in y_true don't appear in y_pred.
classes_all  = sorted(np.unique(y_true))
classes_pred = sorted(np.unique(y_pred))
classes_missing = set(y_true) - set(y_pred)
print(f'{len(classes_missing)} classes are missing in the classification (y_pred) : {classes_missing} \n')
print(f'All training classes :\n {classes_all}')
print(f'All predicted classes (at least once) :\n {classes_pred}')
0 classes are missing in the classification (y_pred) : set() All training classes : [3, 6, 8, 9, 17, 21, 22, 111, 112, 115, 117, 119, 121, 143, 151, 181, 192] All predicted classes (at least once) : [3, 6, 8, 9, 17, 21, 22, 111, 112, 115, 117, 119, 121, 143, 151, 181, 192]
lut_df = pd.read_csv(s4s_lut_csv, sep=';')
classes_name = lut_df[lut_df[field_code].isin(classes_all)].sort_values(field_code)[field_name].drop_duplicates().to_list()
for code,name in zip(classes_all, classes_name):
    print(f'{code} - {name}')
3 - Grassland and meadows 6 - Forest 8 - Build-up surface 9 - Water bodies 17 - Leguminous crops 21 - Fruits trees 22 - Vineyards 111 - Wheat 112 - Maize 115 - Barley 117 - Oats 119 - Other cereals 121 - Leafy or stem vegetables 143 - Other oilseed crops 151 - Potatoes 181 - Sugar beet 192 - Fibre crops
cm = confusion_matrix(y_true, y_pred)
cm_df = pd.DataFrame(cm)
cm_values = cm_df.values
cm_df.columns = classes_all
cm_df.index = classes_all
# Export CM in a CSV file
cm_df.to_csv(cm_csv, index=True, sep=',')
display(cm_df)
| 3 | 6 | 8 | 9 | 17 | 21 | 22 | 111 | 112 | 115 | 117 | 119 | 121 | 143 | 151 | 181 | 192 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 17529 | 300 | 14 | 0 | 0 | 4 | 8 | 61 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 
| 6 | 41 | 745 | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
| 8 | 44 | 12 | 445 | 11 | 0 | 0 | 0 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
| 9 | 0 | 0 | 28 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
| 17 | 152 | 0 | 9 | 0 | 1138 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 
| 21 | 686 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
| 22 | 15 | 0 | 0 | 0 | 0 | 39 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
| 111 | 128 | 9 | 36 | 0 | 0 | 0 | 0 | 20207 | 0 | 1073 | 29 | 723 | 0 | 0 | 0 | 0 | 25 | 
| 112 | 76 | 0 | 21 | 0 | 30 | 0 | 0 | 1 | 6367 | 0 | 0 | 0 | 0 | 2 | 1 | 37 | 0 | 
| 115 | 308 | 1 | 0 | 0 | 0 | 0 | 0 | 1121 | 0 | 3033 | 127 | 405 | 0 | 1 | 0 | 0 | 0 | 
| 117 | 15 | 1 | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 0 | 74 | 0 | 0 | 0 | 0 | 0 | 143 | 
| 119 | 29 | 0 | 7 | 0 | 0 | 0 | 0 | 4013 | 0 | 0 | 115 | 463 | 0 | 0 | 0 | 0 | 13 | 
| 121 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 741 | 0 | 0 | 0 | 29 | 0 | 0 | 10 | 0 | 
| 143 | 49 | 2 | 1 | 0 | 0 | 0 | 0 | 17 | 0 | 91 | 0 | 0 | 0 | 490 | 0 | 0 | 0 | 
| 151 | 2 | 16 | 3 | 0 | 0 | 0 | 0 | 0 | 132 | 0 | 0 | 0 | 0 | 0 | 3886 | 823 | 0 | 
| 181 | 6 | 11 | 0 | 0 | 0 | 2 | 1 | 0 | 440 | 0 | 0 | 0 | 24 | 0 | 0 | 5066 | 0 | 
| 192 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 685 | 0 | 0 | 31 | 16 | 0 | 0 | 0 | 0 | 19 | 
# invert z idx values
z = cm[::-1]
x = classes_name
y = x[::-1].copy() # invert idx values of x
# change each element of z to type string for annotations
z_text = [[str(y) for y in x] for x in z]
# set up figure 
fig = ff.create_annotated_heatmap(z,
                                  x=x,
                                  y=y,
                                  annotation_text=z_text,
                                  colorscale='spectral',
                                  reversescale=True)
# add title
#fig.update_layout(title_text=f"Confusion Matrix - {site}, {year}")
# adjust margins to make room for yaxis title
#fig.update_layout(margin=dict(t=200, l=200))
#fig.update_xaxes(tickfont_size=20)
#fig.update_yaxes(tickfont_size=20)
#fig.update_layout(font_size=25)
# add colorbar
#fig['data'][0]['showscale'] = True
fig.show()
fig.write_html(cm_html, full_html=False)
If you decide that you are not interested in the scores of classes that were not predicted, then you can explicitly specify the classes you are interested in (which are labels that were predicted at least once).
acc_metrics_str  = classification_report(y_true,
                                         y_pred,
                                         target_names=classes_name,
                                         labels=classes_all,
                                         digits=3)
print(acc_metrics_str)
                          precision    recall  f1-score   support
   Grassland and meadows      0.918     0.978     0.947     17918
                  Forest      0.678     0.916     0.779       813
        Build-up surface      0.753     0.846     0.797       526
            Water bodies      0.718     0.500     0.589        56
        Leguminous crops      0.974     0.873     0.921      1303
            Fruits trees      0.000     0.000     0.000       688
               Vineyards      0.000     0.000     0.000        54
                   Wheat      0.773     0.909     0.836     22230
                   Maize      0.829     0.974     0.896      6535
                  Barley      0.723     0.607     0.660      4996
                    Oats      0.196     0.306     0.239       242
           Other cereals      0.288     0.100     0.148      4640
Leafy or stem vegetables      0.547     0.037     0.069       785
     Other oilseed crops      0.994     0.754     0.857       650
                Potatoes      0.999     0.799     0.888      4862
              Sugar beet      0.853     0.913     0.882      5550
             Fibre crops      0.095     0.025     0.040       751
                accuracy                          0.820     72599
               macro avg      0.608     0.561     0.562     72599
            weighted avg      0.786     0.820     0.793     72599
oa = accuracy_score(y_true, y_pred)
oa = round(oa*100, 2)
print(f'Overall Accuracy : {oa}%')
Overall Accuracy : 81.98%
acc_metrics_dict = classification_report(y_true, y_pred,target_names=classes_name, output_dict=True)
am_df = pd.DataFrame.from_dict(acc_metrics_dict).round(3)
am_df = am_df.iloc[:,:-3]
#am_df = pd.concat([am_df, nb_df])
#am_df = am_df.sort_values(by='pix_count', ascending=False, axis=1)
class_name = am_df.columns.to_list()
precision = am_df.loc['precision'].to_list()
recall    = am_df.loc['recall'].to_list()
f1_score  = am_df.loc['f1-score'].to_list()
fig = go.Figure(data=[
    go.Bar(name='Precision', x=class_name, y=precision, text=precision, textposition='auto'),
    go.Bar(name='Recall', x=class_name, y=recall, text=recall, textposition='auto'),
    go.Bar(name='F1-score', x=class_name, y=f1_score, text=f1_score, textposition='auto')
])
# Change the bar mode
fig.update_layout(title_text=f'Accuracy Metrics - {site}, {year}',
                  barmode='group')
#fig.update_xaxes(tickfont_size=30)
fig.update_yaxes(tickfont_size=10, range=[0,1])
#fig.update_layout(xaxis_title=None, font_size=10)
#fig.update_layout(legend=dict(font=dict(size=25)))
fig.show()
fig.write_html(am_html, full_html=False)