#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
############################################################################
#
# MODULE:       v.class.ml
#
# AUTHOR(S):    Pietro Zambelli (University of Trento)
#
# COPYRIGHT:    (C) 2013 by the GRASS Development Team
#
#               This program is free software under the GNU General Public
#               License (>=v2). Read the file COPYING that comes with GRASS
#               for details.
#
#############################################################################

#%Module
#% description: Classification of vector maps based on the values in the attribute tables
#% keyword: vector
#% keyword: classification
#% keyword: machine learning
#% overwrite: yes
#%End
#%option G_OPT_V_MAP
#% key: vector
#% description: Name of input vector map
#% required: yes
#%end
#%option G_OPT_V_MAP
#% key: vtraining
#% description: Name of training vector map
#% required: no
#%end
#%option
#% key: vlayer
#% type: string
#% multiple: no
#% description: Layer name or number to use for the data
#% required: no
#%end
#%option
#% key: tlayer
#% type: string
#% multiple: no
#% description: Layer number/name for the training layer
#% required: no
#%end
#%option
#% key: rlayer
#% type: string
#% multiple: no
#% description: Layer number/name for the ML results
#% required: no
#%end
#%option
#% key: npy_data
#% type: string
#% multiple: no
#% description: Data with statistics in npy format.
#% answer: data.npy
#% required: no
#%end
#%option
#% key: npy_cats
#% type: string
#% multiple: no
#% description: Numpy array with vector cats.
#% answer: cats.npy
#% required: no
#%end
#%option
#% key: npy_cols
#% type: string
#% multiple: no
#% description: Numpy array with column names.
#% answer: cols.npy
#% required: no
#%end
#%option
#% key: npy_index
#% type: string
#% multiple: no
#% description: Boolean numpy array with training indexes.
#% answer: indx.npy
#% required: no
#%end
#%option
#% key: npy_tdata
#% type: string
#% multiple: no
#% description: npy file with the training data, default: training_data.npy
#% answer: training_data.npy
#% required: no
#%end
#%option
#% key: npy_tclasses
#% type: string
#% multiple: no
#% description: npy file with the training classes, default: training_classes.npy
#% answer: training_classes.npy
#% required: no
#%end
#%option
#% key: npy_btdata
#% type: string
#% multiple: no
#% description: npy file with the balanced training data, default: Xbt.npy
#% answer: Xbt.npy
#% required: no
#%end
#%option
#% key: npy_btclasses
#% type: string
#% multiple: no
#% description: npy file with the balanced training classes, default: Ybt.npy
#% answer: Ybt.npy
#% required: no
#%end
#%option
#% key: imp_csv
#% type: string
#% multiple: no
#% description: CSV file name with the feature importances rank using the extra trees algorithm
#% answer: features_importances.csv
#% required: no
#%end
#%option
#% key: imp_fig
#% type: string
#% multiple: no
#% description: Figure file name with the feature importances rank using the extra trees algorithm
#% answer: features_importances.png
#% required: no
#%end
#%option
#% key: scalar
#% type: string
#% multiple: yes
#% description: Scaler method: center (with_mean) and/or scale (with_std) the data; leave empty for no scaling
#% required: no
#% answer: with_mean,with_std
#%end
#%option
#% key: decomposition
#% type: string
#% multiple: no
#% description: Decomposition method (PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, FastICA, TruncatedSVD); use | to separate the method from its parameters, e.g.: PCA|n_components=98
#% required: no
#% answer:
#%end
#%option
#% key: n_training
#% type: integer
#% multiple: no
#% description: Number of random training samples per class used to train the machine learning algorithms
#% required: no
#%end
#%option
#% key: pyclassifiers
#% type: string
#% multiple: no
#% description: Python file with the classifiers
#% required: no
#%end
#%option
#% key: pyvar
#% type: string
#% multiple: no
#% description: Name of the Python variable that must be a list of dictionaries
#% required: no
#%end
#%option
#% key: pyindx
#% type: string
#% multiple: no
#% description: Index or range of indexes of the classifiers to use
#% required: no
#%end
#%option
#% key: pyindx_optimize
#% type: string
#% multiple: no
#% description: Index of the classifier used to optimize the training set
#% required: no
#%end
#%option
#% key: nan
#% type: string
#% multiple: yes
#% description: Column pattern:Value or Numpy function used to substitute NaN values
#% required: no
#% answer: *_skewness:nanmean,*_kurtosis:nanmean
#%end
#%option
#% key: inf
#% type: string
#% multiple: yes
#% description: Key:Value or Numpy function used to substitute Inf values
#% required: no
#% answer: *_skewness:nanmean,*_kurtosis:nanmean
#%end
#%option
#% key: neginf
#% type: string
#% multiple: yes
#% description: Key:Value or Numpy function used to substitute NegInf values
#% required: no
#% answer:
#%end
#%option
#% key: posinf
#% type: string
#% multiple: yes
#% description: Key:Value or Numpy function used to substitute PosInf values
#% required: no
#% answer:
#%end
#%option
#% key: csv_test_cls
#% type: string
#% multiple: no
#% description: CSV file name with the scores of the different machine learning algorithms
#% required: no
#% answer: test_classifiers.csv
#%end
#%option
#% key: report_class
#% type: string
#% multiple: no
#% description: Text file name with the report of the different machine learning algorithms
#% required: no
#% answer: classification_report.txt
#%end
#%option
#% key: svc_c_range
#% type: double
#% multiple: yes
#% description: C value range list to explore the SVC domain
#% required: no
#% answer: 1e-2,1e-1,1e0,1e1,1e2,1e3,1e4,1e5,1e6,1e7,1e8
#%end
#%option
#% key: svc_gamma_range
#% type: double
#% multiple: yes
#% description: gamma value range list to explore the SVC domain
#% required: no
#% answer: 1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4
#%end
#%option
#% key: svc_kernel_range
#% type: string
#% multiple: yes
#% description: kernel value range list to explore the SVC domain
#% required: no
#% answer: linear,poly,rbf,sigmoid
#%end
#%option
#% key: svc_poly_range
#% type: string
#% multiple: yes
#% description: polynomial order list to explore the SVC domain
#% required: no
#% answer:
#%end
#%option
#% key: svc_n_jobs
#% type: integer
#% multiple: no
#% description: Number of jobs to use during the domain exploration
#% required: no
#% answer: 1
#%end
#%option
#% key: svc_c
#% type: double
#% multiple: no
#% description: Definitive C value
#% required: no
#%end
#%option
#% key: svc_gamma
#% type: double
#% multiple: no
#% description: Definitive gamma value
#% required: no
#%end
#%option
#% key: svc_kernel
#% type: string
#% multiple: no
#% description: Definitive kernel value. Available kernels are: linear, poly, rbf, sigmoid, precomputed
#% required: no
#% answer: rbf
#%end
#%option
#% key: svc_img
#% type: string
#% multiple: no
#% description: Filename pattern for the images of the SVC parameter domain
#% required: no
#% answer: domain_%s.svg
#%end
#%option
#% key: rst_names
#% type: string
#% multiple: no
#% description: Filename pattern for the output raster maps
#% required: no
#% answer: %s
#%end
#-----------------------------------------------------
#%flag
#% key: e
#% description: Extract the training set from the vtraining map
#%end
#%flag
#% key: n
#% description: Export to numpy files
#%end
#%flag
#% key: f
#% description: Compute the feature importances with the extra trees algorithm
#%end
#%flag
#% key: b
#% description: Balance the training set using the class with the fewest samples
#%end
#%flag
#% key: o
#% description: Optimize the training samples
#%end
#%flag
#% key: c
#% description: Classify the whole dataset
#%end
#%flag
#% key: r
#% description: Export the classification results to raster maps
#%end
#%flag
#% key: t
#% description: Test different classification methods
#%end
#%flag
#% key: v
#% description: Add to the test the computation of the bias variance
#%end
#%flag
#% key: x
#% description: Add to the test the computation of extra metrics like the confusion matrix, ROC and PR
#%end
#%flag
#% key: d
#% description: Explore the SVC domain
#%end
#%flag
#% key: a
#% description: Append the classification results
#%end
#-----------------------------------------------------
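# Example usage (the map names below are hypothetical; it assumes the module
# is installed as a GRASS addon with its helper scripts on the path):
#   v.class.ml vector=mysegments vtraining=mytraining n_training=500 -e -n -t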
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import imp
import sys
import os
from pprint import pprint
from fnmatch import fnmatch

import numpy as np

from grass.pygrass.utils import set_path
from grass.pygrass.messages import get_msgr
from grass.pygrass.vector import Vector
from grass.pygrass.modules import Module
from grass.script.core import parser, overwrite

set_path('v.class.ml')

from training_extraction import extract_training
from sqlite2npy import save2npy
from npy2table import export_results


DECMP = {}


def load_decompositions():
    """Import decompositions and update dictionary which stores them"""
    from sklearn.decomposition import (PCA, KernelPCA, ProbabilisticPCA,
                                       RandomizedPCA, FastICA, TruncatedSVD)
    from sklearn.lda import LDA
    DECMP.update({
        'PCA': PCA,
        'KernelPCA': KernelPCA,
        'ProbabilisticPCA': ProbabilisticPCA,
        'RandomizedPCA': RandomizedPCA,
        'FastICA': FastICA,
        'TruncatedSVD': TruncatedSVD,
        'LDA': LDA
    })


def get_indexes(string, sep=',', rangesep='-'):
    """
    >>> indx = '1-5,34-36,40'
    >>> [i for i in get_indexes(indx)]
    [1, 2, 3, 4, 5, 34, 35, 36, 40]
    """
    for ind in string.split(sep):
        if rangesep in ind:
            start, stop = ind.split(rangesep)
            for i in range(int(start), int(stop) + 1):
                yield i
        else:
            yield int(ind)


def get_colors(vtraining):
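    """Return a dictionary mapping the category values of the training map
    to their colors, reading the 'cat' and 'color' columns of its
    attribute table."""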
    vect, mset = vtraining.split('@') if '@' in vtraining else (vtraining, '')
    with Vector(vect, mapset=mset, mode='r') as vct:
        cur = vct.table.execute('SELECT cat, color FROM %s;' % vct.name)
        return dict(cur.fetchall())


def convert(string):
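    """Convert a string to a float or, failing that, to the numpy function
    with that name; raise a TypeError if neither conversion is possible."""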
    try:
        return float(string)
    except ValueError:
        try:
            return getattr(np, string)
        except AttributeError:
            msg = "Not a valid option: <%s> is neither a number nor a numpy function."
            raise TypeError(msg % string)


def get_rules(string):
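    """Parse a string like 'pattern:value,pattern:value' and return a
    dictionary mapping each column pattern to a number or a numpy
    function."""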
    res = {}
    pairs = [s.strip().split(':') for s in string.strip().split(',')]
    for key, val in pairs:
        res[key] = convert(val)
    return res


def find_special_cols(array, cols, report=True,
                      special=('nan', 'inf', 'neginf', 'posinf')):
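    """Return a dictionary mapping each kind of special value ('nan', 'inf',
    'neginf', 'posinf') to the list of column indexes that contain it,
    optionally printing a report with the number of occurrences per
    column."""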
    sp = {key: [] for key in special}
    cntr = {key: [] for key in special}
    for i in range(len(cols)):
        for key in special:
            barray = getattr(np, 'is%s' % key)(array[:, i])
            if barray.any():
                sp[key].append(i)
                cntr[key].append(barray.sum())
    if report:
        indent = '    '
        tot = len(array)
        for k in special:
            fmt = '- %15s (%3d/%d, %4.3f%%)'
            if sp[k]:
                strs = [fmt % (col, cnt, tot, cnt / float(tot) * 100)
                        for col, cnt in zip(cols[np.array(sp[k])], cntr[k])]
                print('%s:\n%s' % (k, indent), ('\n%s' % indent).join(strs),
                      sep='')
    return sp


def substitute(X, rules, cols):
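    """Substitute the special values (NaN, Inf, ...) in each column of X
    following the user-defined rules; return the modified array and a
    dictionary with the value used for each column."""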
    vals = {}
    special_cols = find_special_cols(X, cols)
    pprint(special_cols)
    for key in rules.keys():
        vals[key] = {}
        for i in special_cols[key]:
            for rule in rules[key]:
                if fnmatch(cols[i], rule):
                    indx = getattr(np, 'is%s' % key)(X[:, i])
                    val = (rules[key][rule] if np.isscalar(rules[key][rule])
                           else rules[key][rule](X[:, i][~indx]))
                    X[:, i][indx] = val
                    vals[key][cols[i]] = val
    return X, vals


def extract_classes(vect, layer):
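    """Return a dictionary mapping the category values to the class labels,
    reading the 'cat' and 'class' columns of the attribute table of the
    given vector map and layer."""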
    vect, mset = vect.split('@') if '@' in vect else (vect, '')
    with Vector(vect, mapset=mset, layer=layer, mode='r') as vct:
        vct.table.filters.select('cat', 'class')
        return {key: val for key, val in vct.table.execute()}


def main(opt, flg):
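    """Entry point of v.class.ml: depending on the given options and flags,
    extract the training set, export the data to numpy files, balance and
    optimize the training data, test the classifiers, classify the whole
    data set and export the results to vector and raster maps."""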
    # import functions which depend on sklearn only after parser run
    from ml_functions import (balance, explorer_clsfiers, run_classifier,
                              optimize_training, explore_SVC, plot_grid)
    from features import importances, tocsv

    msgr = get_msgr()
    indexes = None
    vect = opt['vector']
    vtraining = opt['vtraining'] if opt['vtraining'] else None
    scaler, decmp = None, None
    vlayer = opt['vlayer'] if opt['vlayer'] else vect + '_stats'
    tlayer = opt['tlayer'] if opt['tlayer'] else vect + '_training'
    rlayer = opt['rlayer'] if opt['rlayer'] else vect + '_results'

    labels = extract_classes(vtraining, 1)
    pprint(labels)

    if opt['scalar']:
        scapar = opt['scalar'].split(',')
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler(with_mean='with_mean' in scapar,
                                with_std='with_std' in scapar)

    if opt['decomposition']:
        dec, params = (opt['decomposition'].split('|')
                       if '|' in opt['decomposition']
                       else (opt['decomposition'], ''))
        kwargs = ({k: v for k, v in (p.split('=') for p in params.split(','))}
                  if params else {})
        load_decompositions()
        decmp = DECMP[dec](**kwargs)

    # if a training map is given, extract the training set
    if vtraining and flg['e']:
        msgr.message("Extract training from: <%s> to <%s>." % (vtraining, vect))
        extract_training(vect, vtraining, tlayer)
        flg['n'] = True

    if flg['n']:
        msgr.message("Save arrays to npy files.")
        save2npy(vect, vlayer, tlayer,
                 fcats=opt['npy_cats'], fcols=opt['npy_cols'],
                 fdata=opt['npy_data'], findx=opt['npy_index'],
                 fclss=opt['npy_tclasses'], ftdata=opt['npy_tdata'])

    # define the classifiers to use/test
    if opt['pyclassifiers'] and opt['pyvar']:
        # import classifiers to use
        mycls = imp.load_source("mycls", opt['pyclassifiers'])
        classifiers = getattr(mycls, opt['pyvar'])
    else:
        from ml_classifiers import CLASSIFIERS
        classifiers = CLASSIFIERS

    # Append the SVC classifier
    if opt['svc_c'] and opt['svc_gamma']:
        from sklearn.svm import SVC
        svc = {'name': 'SVC', 'classifier': SVC,
               'kwargs': {'C': float(opt['svc_c']),
                          'gamma': float(opt['svc_gamma']),
                          'kernel': opt['svc_kernel']}}
        classifiers.append(svc)

    # extract the classifiers selected with pyindx
    if opt['pyindx']:
        indexes = [i for i in get_indexes(opt['pyindx'])]
        classifiers = [classifiers[i] for i in indexes]

    num = int(opt['n_training']) if opt['n_training'] else None

    # load from npy files
    Xt = np.load(opt['npy_tdata'])
    Yt = np.load(opt['npy_tclasses'])
    cols = np.load(opt['npy_cols'])

    # Define rules to substitute NaN, Inf, posInf, negInf values
    rules = {}
    for key in ('nan', 'inf', 'neginf', 'posinf'):
        if opt[key]:
            rules[key] = get_rules(opt[key])
    pprint(rules)

    # Substitute (skip cat column)
    Xt, rules_vals = substitute(Xt, rules, cols[1:])
    Xtoriginal = Xt

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xt, Yt)
        Xt = scaler.transform(Xt)

    # decompose data
    if decmp:
        msgr.message("Decomposing the training data set.")
        decmp.fit(Xt)
        Xt = decmp.transform(Xt)

    # Feature importances with forests of trees
    if flg['f']:
        np.save('training_transformed.npy', Xt)
        importances(Xt, Yt, cols[1:],
                    csv=opt['imp_csv'], img=opt['imp_fig'],
                    # default parameters to save the matplotlib figure
                    **dict(dpi=300, transparent=False, bbox_inches='tight'))

    # optimize the training set
    if flg['o']:
        ind_optimize = (int(opt['pyindx_optimize']) if opt['pyindx_optimize']
                        else 0)
        cls = classifiers[ind_optimize]
        msgr.message("Find the optimum training set.")
        best, Xbt, Ybt = optimize_training(cls, Xt, Yt,
                                           labels,  # {v: k for k, v in labels.items()},
                                           scaler, decmp,
                                           num=num, maxiterations=1000)
        msg = " - save the optimum training data set to: %s."
        msgr.message(msg % opt['npy_btdata'])
        np.save(opt['npy_btdata'], Xbt)
        msg = " - save the optimum training classes set to: %s."
        msgr.message(msg % opt['npy_btclasses'])
        np.save(opt['npy_btclasses'], Ybt)

    # balance the data
    if flg['b']:
        msg = "Balancing the training data set, each class has <%d> samples."
        msgr.message(msg % num)
        Xbt, Ybt = balance(Xt, Yt, num)
    else:
        if not flg['o']:
            Xbt = (np.load(opt['npy_btdata'])
                   if os.path.isfile(opt['npy_btdata']) else Xt)
            Ybt = (np.load(opt['npy_btclasses'])
                   if os.path.isfile(opt['npy_btclasses']) else Yt)

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xbt, Ybt)
        Xt = scaler.transform(Xt)
        Xbt = scaler.transform(Xbt)

    if flg['d']:
        C_range = [float(c) for c in opt['svc_c_range'].split(',') if c]
        gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',') if g]
        kernel_range = [str(s) for s in opt['svc_kernel_range'].split(',') if s]
        poly_range = [int(i) for i in opt['svc_poly_range'].split(',') if i]
        allkwargs = dict(C=C_range, gamma=gamma_range,
                         kernel=kernel_range, degree=poly_range)
        kwargs = {}
        for k in allkwargs:
            if allkwargs[k]:
                kwargs[k] = allkwargs[k]
        msgr.message("Exploring the SVC domain.")
        grid = explore_SVC(Xbt, Ybt, n_folds=5, n_jobs=int(opt['svc_n_jobs']),
                           **kwargs)
        import pickle
        krnlstr = '_'.join(s for s in opt['svc_kernel_range'].split(',') if s)
        # pickle requires a file opened in binary mode
        with open('grid%s.pkl' % krnlstr, 'wb') as pkl:
            pickle.dump(grid, pkl)
        # pkl = open('grid.pkl', 'rb')
        # grid = pickle.load(pkl)
        # pkl.close()
        plot_grid(grid, save=opt['svc_img'])

    # test the accuracy of different classifiers
    if flg['t']:
        # test different classifiers
        msgr.message("Exploring different classifiers.")
        msgr.message("cls_id cls_name mean max min std")

        res = explorer_clsfiers(classifiers, Xt, Yt, labels=labels,
                                indexes=indexes, n_folds=5,
                                bv=flg['v'], extra=flg['x'])
        # TODO: sort(order=...) is working only in the terminal, why?
        # res.sort(order='mean')
        with open(opt['csv_test_cls'], 'w') as csv:
            csv.write(tocsv(res))

    if flg['c']:
        # classify
        data = np.load(opt['npy_data'])
        indx = np.load(opt['npy_index'])

        # Substitute using column values
        data, dummy = substitute(data, rules, cols[1:])
        Xt = data[indx]

        if scaler:
            msgr.message("Scaling the training data set.")
            scaler.fit(Xt, Yt)
            Xt = scaler.transform(Xt)
            msgr.message("Scaling the whole data set.")
            data = scaler.transform(data)
        if decmp:
            msgr.message("Decomposing the training data set.")
            decmp.fit(Xt)
            Xt = decmp.transform(Xt)
            msgr.message("Decomposing the whole data set.")
            data = decmp.transform(data)
        cats = np.load(opt['npy_cats'])

        np.save('data_filled_scaled.npy', data)
        tcols = []
        # open the report file only once, otherwise each classifier
        # truncates the report written by the previous one
        report = (open(opt['report_class'], "w")
                  if opt['report_class'] else sys.stdout)
        for cls in classifiers:
            run_classifier(cls, Xt, Yt, Xt, Yt, labels, data,
                           report=report)
            tcols.append((cls['name'], 'INTEGER'))
        if report is not sys.stdout:
            report.close()

        import pickle
        # pickle requires a file opened in binary mode
        with open('classification_results.pkl', 'wb') as res:
            pickle.dump(classifiers, res)
        # classifiers = pickle.load(res)
        msgr.message("Export the results to layer: <%s>" % str(rlayer))
        export_results(vect, classifiers, cats, rlayer, vtraining, tcols,
                       overwrite(), pkl='res.pkl', append=flg['a'])

    if flg['r']:
        rules = ('\n'.join(['%d %s' % (k, v)
                            for k, v in get_colors(vtraining).items()])
                 if vtraining else None)

        msgr.message("Export the layer with the results to raster maps.")
        with Vector(vect, mode='r') as vct:
            tab = vct.dblinks.by_name(rlayer).table()
            rasters = [c for c in tab.columns]
            rasters.remove(tab.key)

        v2rst = Module('v.to.rast')
        rclrs = Module('r.colors')
        for rst in rasters:
            v2rst(input=vect, layer=rlayer, type='area',
                  use='attr', attrcolumn=rst.encode(),
                  output=(opt['rst_names'] % rst).encode(),
                  memory=1000, overwrite=overwrite())
            if rules:
                rclrs(map=rst.encode(), rules='-', stdin_=rules)


if __name__ == "__main__":
    main(*parser())