source: grass-addons/grass7/vector/v.class.ml/v.class.ml.py

Last change on this file was 73969, checked in by neteler, 6 years ago

vector addons: fix tab/space indentation mess; tabs are to be avoided, see https://trac.osgeo.org/grass/wiki/Submitting/Python?version=19#Editorsettingsfor4-spaceindentation

  • Property svn:eol-style set to native
  • Property svn:executable set to *
  • Property svn:mime-type set to text/x-python
File size: 21.3 KB
Line 
1#!/usr/bin/env python
2# -- coding: utf-8 --
3#
4############################################################################
5#
6# MODULE: v.class.ml
7#
8# AUTHOR(S): Pietro Zambelli (University of Trento)
9#
10# COPYRIGHT: (C) 2013 by the GRASS Development Team
11#
12# This program is free software under the GNU General Public
13# License (>=v2). Read the file COPYING that comes with GRASS
14# for details.
15#
16#############################################################################
17
18#%Module
#% description: Classification of a vector map based on the values in attribute tables
20#% keyword: vector
21#% keyword: classification
22#% keyword: machine learning
23#% overwrite: yes
24#%End
25#%option G_OPT_V_MAP
26#% key: vector
27#% description: Name of input vector map
28#% required: yes
29#%end
30#%option G_OPT_V_MAP
31#% key: vtraining
32#% description: Name of training vector map
33#% required: no
34#%end
35#%option
36#% key: vlayer
37#% type: string
38#% multiple: no
39#% description: layer name or number to use for data
40#% required: no
41#%end
42#%option
43#% key: tlayer
44#% type: string
45#% multiple: no
46#% description: layer number/name for the training layer
47#% required: no
48#%end
49#%option
50#% key: rlayer
51#% type: string
52#% multiple: no
53#% description: layer number/name for the ML results
54#% required: no
55#%end
56#%option
57#% key: npy_data
58#% type: string
59#% multiple: no
60#% description: Data with statistics in npy format.
61#% answer: data.npy
62#% required: no
63#%end
64#%option
65#% key: npy_cats
66#% type: string
67#% multiple: no
68#% description: Numpy array with vector cats.
69#% answer: cats.npy
70#% required: no
71#%end
72#%option
73#% key: npy_cols
74#% type: string
75#% multiple: no
76#% description: Numpy array with columns names.
77#% answer: cols.npy
78#% required: no
79#%end
80#%option
81#% key: npy_index
82#% type: string
83#% multiple: no
84#% description: Boolean numpy array with training indexes.
85#% answer: indx.npy
86#% required: no
87#%end
88#%option
89#% key: npy_tdata
90#% type: string
91#% multiple: no
92#% description: training npy file with training set, default: training_data.npy
93#% answer: training_data.npy
94#% required: no
95#%end
96#%option
97#% key: npy_tclasses
98#% type: string
99#% multiple: no
100#% description: training npy file with the classes, default: training_classes.npy
101#% answer: training_classes.npy
102#% required: no
103#%end
104#%option
105#% key: npy_btdata
106#% type: string
107#% multiple: no
108#% description: training npy file with training set, default: training_data.npy
109#% answer: Xbt.npy
110#% required: no
111#%end
112#%option
113#% key: npy_btclasses
114#% type: string
115#% multiple: no
116#% description: training npy file with the classes, default: training_classes.npy
117#% answer: Ybt.npy
118#% required: no
119#%end
120#%option
121#% key: imp_csv
122#% type: string
123#% multiple: no
124#% description: CSV file name with the feature importances rank using extra tree algorithms
125#% answer: features_importances.csv
126#% required: no
127#%end
128#%option
129#% key: imp_fig
130#% type: string
131#% multiple: no
132#% description: Figure file name with feature importances rank using extra tree algorithms
133#% answer: features_importances.png
134#% required: no
135#%end
136#%option
137#% key: scalar
138#% type: string
139#% multiple: yes
#% description: scaler method; center the data before scaling; if empty, do not scale at all
141#% required: no
142#% answer: with_mean,with_std
143#%end
144#%option
145#% key: decomposition
146#% type: string
147#% multiple: no
148#% description: choose a decomposition method (PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, FastICA, TruncatedSVD) and set the parameters using the | to separate the decomposition method from the parameters like: PCA|n_components=98
149#% required: no
150#% answer:
151#%end
152#%option
153#% key: n_training
154#% type: integer
155#% multiple: no
156#% description: Number of random training per class to training the machine learning algorithms
157#% required: no
158#%end
159#%option
160#% key: pyclassifiers
161#% type: string
162#% multiple: no
163#% description: a python file with classifiers
164#% required: no
165#%end
166#%option
167#% key: pyvar
168#% type: string
169#% multiple: no
170#% description: name of the python variable that must be a list of dictionary
171#% required: no
172#%end
173#%option
174#% key: pyindx
175#% type: string
176#% multiple: no
177#% description: specify the index or range of index of the classifiers that you want to use
178#% required: no
179#%end
180#%option
181#% key: pyindx_optimize
182#% type: string
183#% multiple: no
184#% description: Index of the classifiers to optimize the training set
185#% required: no
186#%end
187#%option
188#% key: nan
189#% type: string
190#% multiple: yes
#% description: Column pattern:Value or Numpy function to use to substitute NaN values
192#% required: no
193#% answer: *_skewness:nanmean,*_kurtosis:nanmean
194#%end
195#%option
196#% key: inf
197#% type: string
198#% multiple: yes
#% description: Key:Value or Numpy function to use to substitute Inf values
200#% required: no
201#% answer: *_skewness:nanmean,*_kurtosis:nanmean
202#%end
203#%option
204#% key: neginf
205#% type: string
206#% multiple: yes
#% description: Key:Value or Numpy function to use to substitute neginf values
208#% required: no
209#% answer:
210#%end
211#%option
212#% key: posinf
213#% type: string
214#% multiple: yes
#% description: Key:Value or Numpy function to use to substitute posinf values
216#% required: no
217#% answer:
218#%end
219#%option
220#% key: csv_test_cls
221#% type: string
222#% multiple: no
223#% description: csv file name with results of different machine learning scores
224#% required: no
225#% answer: test_classifiers.csv
226#%end
227#%option
228#% key: report_class
229#% type: string
230#% multiple: no
231#% description: text file name with the report of different machine learning algorithms
232#% required: no
233#% answer: classification_report.txt
234#%end
235#%option
236#% key: svc_c_range
237#% type: double
238#% multiple: yes
239#% description: C value range list to explore SVC domain
240#% required: no
241#% answer: 1e-2,1e-1,1e0,1e1,1e2,1e3,1e4,1e5,1e6,1e7,1e8
242#%end
243#%option
244#% key: svc_gamma_range
245#% type: double
246#% multiple: yes
247#% description: gamma value range list to explore SVC domain
248#% required: no
249#% answer: 1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4
250#%end
251#%option
252#% key: svc_kernel_range
253#% type: string
254#% multiple: yes
255#% description: kernel value range list to explore SVC domain
256#% required: no
257#% answer: linear,poly,rbf,sigmoid
258#%end
259#%option
260#% key: svc_poly_range
261#% type: string
262#% multiple: yes
263#% description: polynomial order list to explore SVC domain
264#% required: no
265#% answer:
266#%end
267#%option
268#% key: svc_n_jobs
269#% type: integer
270#% multiple: no
271#% description: number of jobs to use during the domain exploration
272#% required: no
273#% answer: 1
274#%end
275#%option
276#% key: svc_c
277#% type: double
278#% multiple: no
279#% description: definitive C value
280#% required: no
281#%end
282#%option
283#% key: svc_gamma
284#% type: double
285#% multiple: no
286#% description: definitive gamma value
287#% required: no
288#%end
289#%option
290#% key: svc_kernel
291#% type: string
292#% multiple: no
#% description: definitive kernel value. Available kernel are: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
294#% required: no
295#% answer: rbf
296#%end
297#%option
298#% key: svc_img
299#% type: string
300#% multiple: no
301#% description: filename pattern with the image of SVC parameter
302#% required: no
303#% answer: domain_%s.svg
304#%end
305#%option
306#% key: rst_names
307#% type: string
308#% multiple: no
309#% description: filename pattern for raster
310#% required: no
311#% answer: %s
312#%end
313#-----------------------------------------------------
314#%flag
315#% key: e
316#% description: Extract the training set from the vtraining map
317#%end
318#%flag
319#% key: n
320#% description: Export to numpy files
321#%end
322#%flag
323#% key: f
324#% description: Feature importances using extra trees algorithm
325#%end
326#%flag
327#% key: b
328#% description: Balance the training using the class with the minor number of data
329#%end
330#%flag
331#% key: o
332#% description: Optimize the training samples
333#%end
334#%flag
335#% key: c
336#% description: Classify the whole dataset
337#%end
338#%flag
339#% key: r
340#% description: Export the classify results to raster maps
341#%end
342#%flag
343#% key: t
344#% description: Test different classification methods
345#%end
346#%flag
347#% key: v
348#% description: add to test to compute the Bias variance
349#%end
350#%flag
351#% key: x
352#% description: add to test to compute extra parameters like: confusion matrix, ROC, PR
353#%end
354#%flag
355#% key: d
356#% description: Explore the SVC domain
357#%end
358#%flag
359#% key: a
360#% description: append the classification results
361#%end
362#-----------------------------------------------------
363from __future__ import (absolute_import, division, print_function,
364 unicode_literals)
365import imp
366import sys
367import os
368from pprint import pprint
369from fnmatch import fnmatch
370
371import numpy as np
372
373from grass.pygrass.utils import set_path
374from grass.pygrass.messages import get_msgr
375from grass.pygrass.vector import Vector
376from grass.pygrass.modules import Module
377from grass.script.core import parser, overwrite
378
379set_path('v.class.ml')
380
381from training_extraction import extract_training
382from sqlite2npy import save2npy
383from npy2table import export_results
384
385
386DECMP = {}
387
388
389def load_decompositions():
390 """Import decompositions and update dictionary which stores them"""
391 from sklearn.decomposition import (PCA, KernelPCA, ProbabilisticPCA,
392 RandomizedPCA, FastICA, TruncatedSVD)
393 from sklearn.lda import LDA
394 DECMP.update({
395 'PCA': PCA,
396 'KernelPCA': KernelPCA,
397 'ProbabilisticPCA': ProbabilisticPCA,
398 'RandomizedPCA': RandomizedPCA,
399 'FastICA': FastICA,
400 'TruncatedSVD': TruncatedSVD,
401 'LDA': LDA
402 })
403
404
405def get_indexes(string, sep=',', rangesep='-'):
406 """
407 >>> indx = '1-5,34-36,40'
408 >>> [i for i in get_indexes(indx)]
409 [1, 2, 3, 4, 5, 34, 35, 36, 40]
410 """
411 for ind in string.split(sep):
412 if rangesep in ind:
413 start, stop = ind.split(rangesep)
414 for i in range(int(start), int(stop) + 1):
415 yield i
416 else:
417 yield int(ind)
418
419
420def get_colors(vtraining):
421 vect, mset = vtraining.split('@') if '@' in vtraining else (vtraining, '')
422 with Vector(vect, mapset=mset, mode='r') as vct:
423 cur = vct.table.execute('SELECT cat, color FROM %s;' % vct.name)
424 return dict([c for c in cur.fetchall()])
425
426
427def convert(string):
428 try:
429 return float(string)
430 except:
431 try:
432 return getattr(np, string)
433 except AttributeError:
434 msg = "Not a valid option, is not a number or a numpy function."
435 raise TypeError(msg)
436
437
438def get_rules(string):
439 res = {}
440 pairs = [s.strip().split(':') for s in string.strip().split(',')]
441 for key, val in pairs:
442 res[key] = convert(val)
443 return res
444
445
446def find_special_cols(array, cols, report=True,
447 special=('nan', 'inf', 'neginf', 'posinf')):
448 sp = {key: [] for key in special}
449 cntr = {key: [] for key in special}
450 for i in range(len(cols)):
451 for key in special:
452 barray = getattr(np, 'is%s' % key)(array[:, i])
453 if barray.any():
454 sp[key].append(i)
455 cntr[key].append(barray.sum())
456 if report:
457 indent = ' '
458 tot = len(array)
459 for k in special:
460 fmt = '- %15s (%3d/%d, %4.3f%%)'
461 if sp[k]:
462 strs = [fmt % (col, cnt, tot, cnt/float(tot)*100)
463 for col, cnt in zip(cols[np.array(sp[k])], cntr[k])]
464 print('%s:\n%s' % (k, indent), ('\n%s' % indent).join(strs),
465 sep='')
466 return sp
467
468
469def substitute(X, rules, cols):
470 vals = {}
471 special_cols = find_special_cols(X, cols)
472 pprint(special_cols)
473 for key in rules.keys():
474 vals[key] = {}
475 for i in special_cols[key]:
476 for rule in rules[key]:
477 if fnmatch(cols[i], rule):
478 indx = getattr(np, 'is%s' % key)(X[:, i])
479 val = (rules[key][rule] if np.isscalar(rules[key][rule])
480 else rules[key][rule](X[:, i][~indx]))
481 X[:, i][indx] = val
482 vals[key][cols[i]] = val
483 return X, vals
484
485
486def extract_classes(vect, layer):
487 vect, mset = vect.split('@') if '@'in vect else (vect, '')
488 with Vector(vect, mapset=mset, layer=layer, mode='r') as vct:
489 vct.table.filters.select('cat', 'class')
490 return {key: val for key, val in vct.table.execute()}
491
492
493def main(opt, flg):
494 # import functions which depend on sklearn only after parser run
495 from ml_functions import (balance, explorer_clsfiers, run_classifier,
496 optimize_training, explore_SVC, plot_grid)
497 from features import importances, tocsv
498
499 msgr = get_msgr()
500 indexes = None
501 vect = opt['vector']
502 vtraining = opt['vtraining'] if opt['vtraining'] else None
503 scaler, decmp = None, None
504 vlayer = opt['vlayer'] if opt['vlayer'] else vect + '_stats'
505 tlayer = opt['tlayer'] if opt['tlayer'] else vect + '_training'
506 rlayer = opt['rlayer'] if opt['rlayer'] else vect + '_results'
507
508 labels = extract_classes(vtraining, 1)
509 pprint(labels)
510
511 if opt['scalar']:
512 scapar = opt['scalar'].split(',')
513 from sklearn.preprocessing import StandardScaler
514 scaler = StandardScaler(with_mean='with_mean' in scapar,
515 with_std='with_std' in scapar)
516
517 if opt['decomposition']:
518 dec, params = (opt['decomposition'].split('|')
519 if '|' in opt['decomposition']
520 else (opt['decomposition'], ''))
521 kwargs = ({k: v for k, v in (p.split('=') for p in params.split(','))}
522 if params else {})
523 load_decompositions()
524 decmp = DECMP[dec](**kwargs)
525
526 # if training extract training
527 if vtraining and flg['e']:
528 msgr.message("Extract training from: <%s> to <%s>." % (vtraining, vect))
529 extract_training(vect, vtraining, tlayer)
530 flg['n'] = True
531
532 if flg['n']:
533 msgr.message("Save arrays to npy files.")
534 save2npy(vect, vlayer, tlayer,
535 fcats=opt['npy_cats'], fcols=opt['npy_cols'],
536 fdata=opt['npy_data'], findx=opt['npy_index'],
537 fclss=opt['npy_tclasses'], ftdata=opt['npy_tdata'])
538
539 # define the classifiers to use/test
540 if opt['pyclassifiers'] and opt['pyvar']:
541 # import classifiers to use
542 mycls = imp.load_source("mycls", opt['pyclassifiers'])
543 classifiers = getattr(mycls, opt['pyvar'])
544 else:
545 from ml_classifiers import CLASSIFIERS
546 classifiers = CLASSIFIERS
547
548 # Append the SVC classifier
549 if opt['svc_c'] and opt['svc_gamma']:
550 from sklearn.svm import SVC
551 svc = {'name': 'SVC', 'classifier': SVC,
552 'kwargs': {'C': float(opt['svc_c']),
553 'gamma': float(opt['svc_gamma']),
554 'kernel': opt['svc_kernel']}}
555 classifiers.append(svc)
556
557 # extract classifiers from pyindx
558 if opt['pyindx']:
559 indexes = [i for i in get_indexes(opt['pyindx'])]
560 classifiers = [classifiers[i] for i in indexes]
561
562 num = int(opt['n_training']) if opt['n_training'] else None
563
564 # load fron npy files
565 Xt = np.load(opt['npy_tdata'])
566 Yt = np.load(opt['npy_tclasses'])
567 cols = np.load(opt['npy_cols'])
568
569 # Define rules to substitute NaN, Inf, posInf, negInf values
570 rules = {}
571 for key in ('nan', 'inf', 'neginf', 'posinf'):
572 if opt[key]:
573 rules[key] = get_rules(opt[key])
574 pprint(rules)
575
576 # Substitute (skip cat column)
577 Xt, rules_vals = substitute(Xt, rules, cols[1:])
578 Xtoriginal = Xt
579
580 # scale the data
581 if scaler:
582 msgr.message("Scaling the training data set.")
583 scaler.fit(Xt, Yt)
584 Xt = scaler.transform(Xt)
585
586 # decompose data
587 if decmp:
588 msgr.message("Decomposing the training data set.")
589 decmp.fit(Xt)
590 Xt = decmp.transform(Xt)
591
592 # Feature importances with forests of trees
593 if flg['f']:
594 np.save('training_transformed.npy', Xt)
595 importances(Xt, Yt, cols[1:],
596 csv=opt['imp_csv'], img=opt['imp_fig'],
597 # default parameters to save the matplotlib figure
598 **dict(dpi=300, transparent=False, bbox_inches='tight'))
599
600 # optimize the training set
601 if flg['o']:
602 ind_optimize = (int(opt['pyindx_optimize']) if opt['pyindx_optimize']
603 else 0)
604 cls = classifiers[ind_optimize]
605 msgr.message("Find the optimum training set.")
606 best, Xbt, Ybt = optimize_training(cls, Xt, Yt,
607 labels, #{v: k for k, v in labels.items()},
608 scaler, decmp,
609 num=num, maxiterations=1000)
610 msg = " - save the optimum training data set to: %s."
611 msgr.message(msg % opt['npy_btdata'])
612 np.save(opt['npy_btdata'], Xbt)
613 msg = " - save the optimum training classes set to: %s."
614 msgr.message(msg % opt['npy_btclasses'])
615 np.save(opt['npy_btclasses'], Ybt)
616
617 # balance the data
618 if flg['b']:
619 msg = "Balancing the training data set, each class have <%d> samples."
620 msgr.message(msg % num)
621 Xbt, Ybt = balance(Xt, Yt, num)
622 else:
623 if not flg['o']:
624 Xbt = (np.load(opt['npy_btdata'])
625 if os.path.isfile(opt['npy_btdata']) else Xt)
626 Ybt = (np.load(opt['npy_btclasses'])
627 if os.path.isfile(opt['npy_btclasses']) else Yt)
628
629 # scale the data
630 if scaler:
631 msgr.message("Scaling the training data set.")
632 scaler.fit(Xbt, Ybt)
633 Xt = scaler.transform(Xt)
634 Xbt = scaler.transform(Xbt)
635
636 if flg['d']:
637 C_range = [float(c) for c in opt['svc_c_range'].split(',') if c]
638 gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',') if g]
639 kernel_range = [str(s) for s in opt['svc_kernel_range'].split(',') if s]
640 poly_range = [int(i) for i in opt['svc_poly_range'].split(',') if i]
641 allkwargs = dict(C=C_range, gamma=gamma_range,
642 kernel=kernel_range, degree=poly_range)
643 kwargs = {}
644 for k in allkwargs:
645 if allkwargs[k]:
646 kwargs[k] = allkwargs[k]
647 msgr.message("Exploring the SVC domain.")
648 grid = explore_SVC(Xbt, Ybt, n_folds=5, n_jobs=int(opt['svc_n_jobs']),
649 **kwargs)
650 import pickle
651 krnlstr = '_'.join(s for s in opt['svc_kernel_range'].split(',') if s)
652 pkl = open('grid%s.pkl' % krnlstr, 'w')
653 pickle.dump(grid, pkl)
654 pkl.close()
655# pkl = open('grid.pkl', 'r')
656# grid = pickle.load(pkl)
657# pkl.close()
658 plot_grid(grid, save=opt['svc_img'])
659
660 # test the accuracy of different classifiers
661 if flg['t']:
662 # test different classifiers
663 msgr.message("Exploring different classifiers.")
664 msgr.message("cls_id cls_name mean max min std")
665
666 res = explorer_clsfiers(classifiers, Xt, Yt, labels=labels,
667 indexes=indexes, n_folds=5,
668 bv=flg['v'], extra=flg['x'])
669 # TODO: sort(order=...) is working only in the terminal, why?
670 #res.sort(order='mean')
671 with open(opt['csv_test_cls'], 'w') as csv:
672 csv.write(tocsv(res))
673
674 if flg['c']:
675 # classify
676 data = np.load(opt['npy_data'])
677 indx = np.load(opt['npy_index'])
678
679 # Substitute using column values
680 data, dummy = substitute(data, rules, cols[1:])
681 Xt = data[indx]
682
683 if scaler:
684 msgr.message("Scaling the training data set.")
685 scaler.fit(Xt, Yt)
686 Xt = scaler.transform(Xt)
687 msgr.message("Scaling the whole data set.")
688 data = scaler.transform(data)
689 if decmp:
690 msgr.message("Decomposing the training data set.")
691 decmp.fit(Xt)
692 Xt = decmp.transform(Xt)
693 msgr.message("Decompose the whole data set.")
694 data = decmp.transform(data)
695 cats = np.load(opt['npy_cats'])
696
697 np.save('data_filled_scaled.npy', data)
698 tcols = []
699 for cls in classifiers:
700 report = (open(opt['report_class'], "w")
701 if opt['report_class'] else sys.stdout)
702 run_classifier(cls, Xt, Yt, Xt, Yt, labels, data,
703 report=report)
704 tcols.append((cls['name'], 'INTEGER'))
705
706 import pickle
707 with open('classification_results.pkl', 'w') as res:
708 pickle.dump(classifiers, res)
709 #classifiers = pickle.load(res)
710 msgr.message("Export the results to layer: <%s>" % str(rlayer))
711 export_results(vect, classifiers, cats, rlayer, vtraining, tcols,
712 overwrite(), pkl='res.pkl', append=flg['a'])
713# res.close()
714
715 if flg['r']:
716 rules = ('\n'.join(['%d %s' % (k, v)
717 for k, v in get_colors(vtraining).items()])
718 if vtraining else None)
719
720 msgr.message("Export the layer with results to raster")
721 with Vector(vect, mode='r') as vct:
722 tab = vct.dblinks.by_name(rlayer).table()
723 rasters = [c for c in tab.columns]
724 rasters.remove(tab.key)
725
726 v2rst = Module('v.to.rast')
727 rclrs = Module('r.colors')
728 for rst in rasters:
729 v2rst(input=vect, layer=rlayer, type='area',
730 use='attr', attrcolumn=rst.encode(),
731 output=(opt['rst_names'] % rst).encode(),
732 memory=1000, overwrite=overwrite())
733 if rules:
734 rclrs(map=rst.encode(), rules='-', stdin_=rules)
735
736
737if __name__ == "__main__":
738 main(*parser())
Note: See TracBrowser for help on using the repository browser.