24
 ! "#$%&'() %* +#,,%&$ -./$%& 0.1&.22'%* !).3 45 +6%)( (*7 8.&*9(& 7 +/9 : %);%,< +.,$.6=.& >?@ A??> !"#$%&'$ B* $9'2 $#$%&'() C. 1'D. (* %D.&D'.C %< $9. =(2'/ '7.(2 #*7.&E )F'*1 +#,,%&$ -./$%& G+-H 6(/9'*.2 <%& <#*/$'%* .2$'6($'%*5 I#&$9.&6%&.@ C. '*/)#7. ( 2#66(&F %< /#&&.*$)F #2.7 ()1%E &'$962 <%& $&('*'*1 +- 6(/9'*.2@ /%D.&'*1 =%$9 $9. J#(7&($'/ G%& /%*D.3H ,&%1&(66'*1 ,(&$ (*7 (7D(*/.7 6.$9%72 <%& 7.()'*1 C'$9 )(&1. 7($(2.$25 I'*())F @ C. 6.*$'%* 2%6. 6%7' !E /($'%*2 (*7 .3$.*2'%*2 $9($ 9(D. =..* (,,)'.7 $% $9. 2$(*7(&7 +- ()1%&'$96@ (*7 7'2/#22 $9. (2,./$ %< &.1#)(&'K($'%* <&%6 ( +- ,.&2,./$'D.5 ( )*$%+,-'$ .+* "9. ,#&,%2. %< $9'2 ,(,.& '2 $C%<%)75 B$ 29%#)7 2.&D. (2 ( 2.)<E /%*$('*.7 '*$&%7#/$'%* $% +#,,%&$ -./$%& &.1&.22'%* <%& &.(7E .&2 *.C $% $9'2 &(,'7)F 7.D.)%,'*1  !.)7 %< &.2.(&/95 L M* $9. %$9.& 9(*7@ '$ ($$.6,$2 $% 1'D. (* %D.&D'.C %< &./.*$ 7.D.)%,E 6.*$2 '* $9.  !.)75 "% $9'2 .*7@ C. 7./'7.7 $% %&1(*'K. $9. .22(F (2 <%))%C25 N. 2$(&$ =F 1'D'*1 ( =&'.< %D.&D'.C %< $9. =(2'/ $./9*'J#.2 '* 2./$'%*2 L@ A@ (*7 >@ ,)#2 ( 29%&$ 2#66(&F C'$9 ( *#6=.& %< !1#& .2 (*77'(1& (62 '* 2./ $'%* O5 +./$'%* P &.D '.C2 /#&&. *$ ()E 1%&'$96'/ $./9*'J#.2 #2.7 <%& (/$#())F '6,).6.*$'*1 +- 6(E /9'*.25 "9'2 6(F =. %< 6%2 $ '*$.& .2$ <%& ,&( /$'$'%*.&25 "9. <%))%C'*1 2./$'%* /%D.&2 6%&. (7D(*/.7 $%,'/2 2#/9 (2 .3$.*E 2'%*2 %< $9. =(2'/ +- ()1%&'$96@ /%**./$'%*2 =.$C..* +- 6(E /9'*.2 (*7 &.1#)(&'K($'%* (*7 =&'."F 6.*$'%*2 6.$9%72 <%& /(&&F'*1 %#$ 6%7.) 2.)./$'%*5 N. /%*/)#7. C'$9 ( 7'2/#22'%* %< %,.* J#.2$'%*2 (*7 ,&%=).62 (*7 /#&&.*$ 7'&./$'%*2 %< +- &.2. (&/95 Q%2$ %< $9. &.2#) $2 ,&.2 .*$.7 '* $9'2 &.D' .C ,(,.& ()E &.(7F 9(D. =..* ,#=)'29.7 .)2.C9.&.@ =#$ $9. /%6,&.9.*2'D. ,&.2.*$($'%*2 (*7 2%6. 7.$(')2 (&. *.C5 (/( 0.#$+%.' 1&' 23%+ -*, "9. +- ()1%& '$96 '2 ( *%*)' *.(& 1.*.&() 'K($'%* %< $9.  !"#"$% &'()"* +,$-$&(-  ()1%&'$96 7.D.)%,.7 '* 0#22'( '* $9. 2'3$'.2 A R-(,*'; (*7 S.&*.&@ LTU>@ -(,*'; (*7 V9.&D%*.*;'2@ LTUOW5 !* .3$.*7.7 D.&2' %* %< $9'2 ,(,.& '2 (D(')(=). (2 X.#&%VMS""./9 *'/() 0.E ,%&$ "0ETYE?>?5 0+B +Z@ !#2 $&( )'( * X($ '%*() [*' D.& 2'$ F @ V(*=.&&(@ ?A? ?@ !#2 $&( )'( \ !"#$%&'(")*)+,%#-,%), Q(3E])(*/;EB*2$'$#$ <: #& ='%)%1'2/ 9. ^F=.& *.$';@ _A?_ U " : #='*1.*@ `.&6(*F @ .#/+0)/-%&10(#"2(34*5,#67+8#+%'38%-# L M#& #2. %< $9. $.&6 a&.1&.22'%*a '2 2%6.C9($ )%2. '* $9($ '$ ()2% '*/)#7.2 /(2.2 %< <#*/$'%* .2$'6($'%* C9.&. %*. 6'*'6'K.2 .&&%&2 %$9.& $9(* $9. 6.(* 2J#(&. )%225 "9'2 '2 7%*. 6('*)F <%& 9'2$%&'/() &.(2%*2 R-(,*'; .$ ()5@ LTT_W5 A ! 2'6')(& (,,&%(/9@ 9%C.D.& #2'*1 )'*.(& '*2$.(7 %< J#(7&($'/ ,&%1&(6E 6'*1@ C(2 $(;.* ($ $9. 2(6. $'6. '* $9. [+!@ 6('*)F =F Q(*1(2(&'(* RLTUP@ LTUY@ LTUTW5 !2 2#/9@ '$ '2  !&6)F 1&%#*7.7 '* $9. <&(6.C%&; %< 2$($'2$'/() ).(&*'*1 $9.%&F@ %& ./ -0",$1@ C9'/9 9(2 =..* 7.D.)%,.7 %D.& $9. )(2$ $9&.. 7./(7.2 =F -(,*'; (*7 V9.&D%*.*;'2 RLT_OW@ - (,*'; RLTYA @ LTTPW5 B* ( *#$29 .))@ -V $9.%&F /9(&(/$. &'K.2 ,&%,.&$'.2 %< ).(&*'*1 6(/9'*.2 C9'/9 .*(=). $9.6 $% 1.*.&E ()'K. C.)) $% #*2..* 7($(5 B* '$2 ,& .2.*$ <%& 6@ $9. +- 6(/ 9'* . C(2 )(&1. )F 7.D .)E %,.7 ($ !"b" 8.)) S(=%&($%&'.2 =F -(,*'; (*7 /%EC%&;.&2 R8%2. & .$ ()5@ LTTA@ `#F %* .$ ()5@ LTT>@ V% &$.2 (*7 - (,*'; @ LTTP@ +/9: %);%,< .$ ()5@ LTTP@ +/9: %);%,< .$ ()5@ LTTU@ -(,*'; .$ ()5@ LTT_W5 c#. $% $9'2 '*7#2$&' () /%*$.3$ @ +- &.2. (&/9 9(2 #, $% 7($. 9(7 ( 2%#*7 %&'.* $($'%* $%C(&7 2 &.() EC%&) 7 (,,)' /($'%* 25 B*'$'() C%&; <%/#2.7 %* MV0 G%,$'/() /9(&(/$.& &./%1*'$'%*H5 N'$9'* ( 29%&$ ,.&'%7 %< $'6.@ +- /)(22'!.&2 =./(6. /%6,.$'E $'D. C'$9 $9. =.2$ (D(')(=). 
2F2$.62 <%& =%$9 MV0 (*7 %=d./$ &./%1*'$'%* $(2;2 R+/9: %);%,< .$ ()5@ LTTU@ LTTY(@ 8)(*K .$ ()5@ LTTU@ +/9: %);%,<@ LTT_W5 ! /%6,&.9.*2'D. $#$%&'() %* +- /)(2E 2'!.&2 9(2 =..* ,#=) '29.7 =F 8#& 1.2 RLTT YW5 8#$ ()2% '* &.E 1&.2 2'%* (*7 $'6. 2.&'. 2 ,&.7 '/$'%*(,,)'/($'%* 2@ .3/.) ).*$ ,.&E <%&6(*/.2 C.&. 2%%* %=$('*.7 RQ: #)).& .$ ()5@ LTT_@ c&#/;.& .$ ()5@ LTT_@ +$'$2%* .$ ()5@ LTTT@ Q($$.&( (*7 e(F;'* @ LTTTW5 ! 2*(,29%$ %< $9. 2$($. %< $9. (&$ '* +- ).(&*'*1 C(2 &./.*$)F $(;.* ($ $9. (**#() 2"3$&' 4#5,$6&-(,# +$,7"88(#9 :18-"68 /%*E <.&.*/. R+/9: %);%,< .$ ()5@ LTTT(W5 +- ).(&*'*1 9(2 *%C .D%)D.7 '*$% (* (/$'D. (&.( %< &.2.(&/95 Q%&.%D.&@ '$ '2 '* $9. ,&%/.22 %< .*$.&'*1 $9. 2$(*7(&7 6.$9%72 $%%)=%3 %< 6(/9'*. ).(&*E '*1 Re(F;'*@ LTTY@ V9.&;(22;F (*7 Q#)'.&@ LTTY@ e.(&2$ .$ ()5@ LTTYW5 R+/9: %);%,< (*7 +6%)(@ A??AW /%*$('*2 ( 6%&. '*E7.,$9 %D.&D'.C %< +-Q &.1&.22'%*5 !77'$'%*())F @ RV&'2$'(*'*' (*7 +9(C. E" (F)%&@ A???@ e.&= &'/9@ A??AW ,&%D '7. <#&$9 .& 7.$(' )2 %* ;.&*.)2 '* $9. /%*$.3$ %< /)(22' !/($'%*5 (/4 567 1&#.' ),7 & +#,,%2. C. (&. 1'D.* $&('*'*1 7($( {(x1, y1),..., (x , y )} ⊂ X × R@ C9.&.  X  7.*%$.2 $9. 2,(/. %< $9. '*,#$ ,($$.&*2 G.515 X  =  R d H5 "9.2 . 6'19$ =.@ <%& '*2$(* /.@ .3/9(*1. &( $.2 <%& 2%6. /#&&.*/F 6.(2#&.7 ($ 2#=2.J#.*$ 7(F2 $%1.$9.& C'$9 /%&&. 2,%*7 '*1 ./% *%6.$& '/ '*7'/ ($%&25 B*  εE+- &.1& .22'% * R-(,*';@ LTTPW@ %#& 1%() '2 $% !*7 ( <#*/$'%* f (x) $9($ 9(2 ($ 6%2$  ε  7.D'($'%* <&%6 $9. (/$#())F %=$('*.7 $(&1.$2  yi  <%& ()) $9. $&('*'*1 7($(@ (*7 ($ $9. 2(6. $'6. '2 (2 "($ (2 ,%22'=).5 B* %$9.& C%&72@ C. 7% *%$ /(&. (=%#$ .&&%&2 (2 )%*1 (2 $9.F (&. ).22 $9(* ε@ =#$ C')) *%$ (//.,$ (*F 7.D'($'%* )(&1.& $9(* $9'25 "9'2 6(F =. '6,%&$(*$ '< F%# C(*$ $% =. 2#&. *%$ $% )%2. 6%&. $9(* ε 6%*.F C9.* 7.()'*1 C'$9 .3/9(*1. &($.2@ <%& '*2$(*/.5 I%& ,.7(1 %1'/() & .(2%*2@ C. =.1'* =F 7. 2/&'= '*1 $9. /(2. %< )'*.(& <#*/$'%*2 f @ $(;'*1 $9. <%&6 f (x) = w, x+ b C'$9 w X, b R  GLH L

svm tutorial 2003.pdf

  • Upload
    mhdella

  • View
    220

  • Download
    0

Embed Size (px)

Citation preview

  • A Tutorial on Support Vector Regression

    Alex J. Smolaand Bernhard Scholkopf

    September 30, 2003

    Abstract

    In this tutorial we give an overview of the basic ideas under-lying Support Vector (SV) machines for function estimation.Furthermore, we include a summary of currently used algo-rithms for training SV machines, covering both the quadratic(or convex) programming part and advanced methods fordealing with large datasets. Finally, we mention somemodi-cations and extensions that have been applied to the standardSV algorithm, and discuss the aspect of regularization from aSV perspective.

    1 Introduction

    The purpose of this paper is twofold. It should serve as a self-contained introduction to Support Vector regression for read-ers new to this rapidly developing eld of research.1 On theother hand, it attempts to give an overview of recent develop-ments in the eld.To this end, we decided to organize the essay as follows.

    We start by giving a brief overview of the basic techniques insections 1, 2, and 3, plus a short summary with a number ofgures and diagrams in section 4. Section 5 reviews current al-gorithmic techniques used for actually implementing SV ma-chines. This may be of most interest for practitioners. Thefollowing section covers more advanced topics such as exten-sions of the basic SV algorithm, connections between SV ma-chines and regularization and briey mentions methods forcarrying out model selection. We conclude with a discussionof open questions and problems and current directions of SVresearch. Most of the results presented in this review paper al-ready have been published elsewhere, but the comprehensivepresentations and some details are new.

    1.1 Historic Background

    The SV algorithm is a nonlinear generalization of the Gener-alized Portrait algorithm developed in Russia in the sixties2

    [Vapnik and Lerner, 1963, Vapnik and Chervonenkis, 1964].

    An extended version of this paper is available as NeuroCOLT Technical Re-port TR-98-030.

    RSISE, Australian National University, Canberra, 0200, Australia;[email protected]

    Max-Planck-Institut fur biologische Kybernetik, 72076 Tubingen, Germany,[email protected]

    1Our use of the term regression is somewhat lose in that it also includescases of function estimation where one minimizes errors other than the meansquare loss. This is done mainly for historical reasons [Vapnik et al., 1997].

    2A similar approach, however using linear instead of quadratic program-ming, was taken at the same time in the USA, mainly by Mangasarian [1965,1968, 1969].

    As such, it is rmly grounded in the framework of statisticallearning theory, or VC theory, which has been developed overthe last three decades by Vapnik and Chervonenkis [1974],Vapnik [1982, 1995]. In a nutshell, VC theory characterizesproperties of learning machines which enable them to gener-alize well to unseen data.In its present form, the SV machine was largely devel-

    oped at AT&T Bell Laboratories by Vapnik and co-workers[Boser et al., 1992, Guyon et al., 1993, Cortes and Vapnik, 1995,Scholkopf et al., 1995, Scholkopf et al., 1996, Vapnik et al.,1997]. Due to this industrial context, SV research has up todate had a sound orientation towards real-world applications.Initial work focused on OCR (optical character recognition).Within a short period of time, SV classiers became competi-tive with the best available systems for both OCR and objectrecognition tasks [Scholkopf et al., 1996, 1998a, Blanz et al.,1996, Scholkopf, 1997]. A comprehensive tutorial on SV clas-siers has been published by Burges [1998]. But also in re-gression and time series prediction applications, excellent per-formances were soon obtained [Muller et al., 1997, Druckeret al., 1997, Stitson et al., 1999, Mattera and Haykin, 1999]. Asnapshot of the state of the art in SV learning was recentlytaken at the annual Neural Information Processing Systems con-ference [Scholkopf et al., 1999a]. SV learning has now evolvedinto an active area of research. Moreover, it is in the processof entering the standard methods toolbox of machine learn-ing [Haykin, 1998, Cherkassky and Mulier, 1998, Hearst et al.,1998]. [Scholkopf and Smola, 2002] contains a more in-depthoverview of SVM regression. Additionally, [Cristianini andShawe-Taylor, 2000, Herbrich, 2002] provide further details onkernels in the context of classication.

    1.2 The Basic Idea

    Suppose we are given training data {(x1, y1), . . . , (x, y)} X R, where X denotes the space of the input patterns (e.g.X = Rd). These might be, for instance, exchange rates forsome currency measured at subsequent days together withcorresponding econometric indicators. In -SV regression[Vapnik, 1995], our goal is to nd a function f(x) that has atmost deviation from the actually obtained targets yi for allthe training data, and at the same time is as at as possible. Inother words, we do not care about errors as long as they areless than , but will not accept any deviation larger than this.This may be important if you want to be sure not to lose morethan money when dealing with exchange rates, for instance.For pedagogical reasons, we begin by describing the case of

    linear functions f , taking the form

    f(x) = w, x+ bwith w X, b R (1)

    1

  • where , denotes the dot product in X. Flatness in the caseof (1) means that one seeks a small w. One way to ensure thisis to minimize the norm,3 i.e. w2 = w,w. We can writethis problem as a convex optimization problem:

    minimize 12w2

    subject to

    yi w, xi b w, xi+ b yi

    (2)

    The tacit assumption in (2) was that such a function f actuallyexists that approximates all pairs (xi, yi) with precision, orin other words, that the convex optimization problem is feasi-ble. Sometimes, however, this may not be the case, or we alsomay want to allow for some errors. Analogously to the softmargin loss function [Bennett and Mangasarian, 1992] whichwas adapted to SVmachines by Cortes and Vapnik [1995], onecan introduce slack variables i, i to cope with otherwise in-feasible constraints of the optimization problem (2). Hence wearrive at the formulation stated in [Vapnik, 1995].

    minimize 12w2 + CP

    i=1(i +

    i )

    subject to

    8 0 determines the trade-off between the at-ness of f and the amount up to which deviations larger than are tolerated. This corresponds to dealing with a so calledinsensitive loss function || described by

    || :=

    0 if || || otherwise. (4)

    Fig. 1 depicts the situation graphically. Only the points out-side the shaded region contribute to the cost insofar, as thedeviations are penalized in a linear fashion. It turns out that

    x

    x

    xx

    x

    xxx

    x

    xx

    x

    x

    x

    HH

    x

    ]HH

    ]

    Figure 1: The soft margin loss setting for a linear SVM.

    inmost cases the optimization problem (3) can be solvedmoreeasily in its dual formulation.4 Moreover, as we will see inSec. 2, the dual formulation provides the key for extendingSV machine to nonlinear functions. Hence we will use a stan-dard dualization method utilizing Lagrange multipliers, asdescribed in e.g. [Fletcher, 1989].

    3See [Smola, 1998] for an overview over other ways of specifying atness ofsuch functions.

    4This is true as long as the dimensionality of w is much higher than thenumber of observations. If this is not the case, specialized methods can offerconsiderable computational savings [Lee and Mangasarian, 2001].

    1.3 Dual Problem and Quadratic Programms

    The key idea is to construct a Lagrange function from the ob-jective function (it will be called the primal objective functionin the rest of this article) and the corresponding constraints,by introducing a dual set of variables. It can be shown thatthis function has a saddle point with respect to the primal anddual variables at the solution. For details see e.g. [Mangasar-ian, 1969, McCormick, 1983, Vanderbei, 1997] and the expla-nations in section 5.2. We proceed as follows:

    L :=12w2 + C

    Xi=1

    (i + i )

    Xi=1

    (ii + i i ) (5)

    X

    i=1

    i(+ i yi + w, xi+ b)

    X

    i=1

    i (+ i + yi w, xi b)

    Here L is the Lagrangian and i, i ,i,i are Lagrange mul-tipliers. Hence the dual variables in (5) have to satisfy positiv-ity constraints, i.e.

    ()i , ()i 0. (6)

    Note that by ()i , we refer to i and i .

    It follows from the saddle point condition that the par-tial derivatives of L with respect to the primal variables(w, b, i, i ) have to vanish for optimality.

    bL =P

    i=1(i i) = 0 (7)

    wL = w Pi=1(i i )xi = 0 (8)()iL = C ()i ()i = 0 (9)

    Substituting (7), (8), and (9) into (5) yields the dual optimiza-tion problem.

    maximize

    8>>>: 12

    Pi,j=1

    (i i )(j j )xi, xj

    P

    i=1(i + i ) +

    Pi=1

    yi(i i )

    subject toP

    i=1(i i ) = 0 and i,i [0, C]

    (10)

    In deriving (10) we already eliminated the dual variablesi, i through condition (9) which can be reformulated as()i = C ()i . Eq. (8) can be rewritten as follows

    w =X

    i=1

    (i i )xi, thus f(x) =X

    i=1

    (i i )xi, x+ b.(11)

    This is the so-called Support Vector expansion, i.e.w can be com-pletely described as a linear combination of the training pat-terns xi. In a sense, the complexity of a functions representa-tion by SVs is independent of the dimensionality of the inputspace X, and depends only on the number of SVs.Moreover, note that the complete algorithm can be de-

    scribed in terms of dot products between the data. Even whenevaluating f(x) we need not compute w explicitly. These ob-servations will come in handy for the formulation of a nonlin-ear extension.

    2

  • 1.4 Computing b

    So far we neglected the issue of computing b. The latter can bedone by exploiting the so called KarushKuhnTucker (KKT)conditions [Karush, 1939, Kuhn and Tucker, 1951]. These statethat at the point of the solution the product between dual vari-ables and constraints has to vanish.

    i(+ i yi + w, xi+ b) = 0i (+

    i + yi w, xi b) = 0 (12)

    and(C i)i = 0(C i )i = 0. (13)

    This allows us tomake several useful conclusions. Firstly onlysamples (xi, yi) with corresponding

    ()i = C lie outside the

    insensitive tube. Secondly ii = 0, i.e. there can never bea set of dual variables i,i which are both simultaneouslynonzero. This allows us to conclude that

    yi + w, xi+ b 0 and i = 0 if i < C (14) yi + w, xi+ b 0 if i > 0 (15)

    In conjunction with an analogous analysis on i we have

    max {+ yi w, xi|i < C or i > 0} b (16)min {+ yi w, xi|i > 0 or i < C}

    If some ()i (0, C) the inequalities become equalities. Seealso [Keerthi et al., 2001] for further means of choosing b.Another way of computing b will be discussed in the con-

    text of interior point optimization (cf. Sec. 5). There b turnsout to be a by-product of the optimization process. Furtherconsiderations shall be deferred to the corresponding section.See also [Keerthi et al., 1999] for further methods to computethe constant offset.A nal note has to be made regarding the sparsity of the SV

    expansion. From (12) it follows that only for |f(xi) yi| the Lagrange multipliers may be nonzero, or in other words,for all samples inside the tube (i.e. the shaded region inFig. 1) the i,i vanish: for |f(xi) yi| < the second fac-tor in (12) is nonzero, hence i,i has to be zero such thatthe KKT conditions are satised. Therefore we have a sparseexpansion of w in terms of xi (i.e. we do not need all xi todescribew). The examples that come with nonvanishing coef-cients are called Support Vectors.

    2 Kernels

    2.1 Nonlinearity by Preprocessing

    The next step is to make the SV algorithm nonlinear. This,for instance, could be achieved by simply preprocessing thetraining patterns xi by a map : X F into some featurespace F, as described in [Aizerman et al., 1964, Nilsson, 1965]and then applying the standard SV regression algorithm. Letus have a brief look at an example given in [Vapnik, 1995].

    Example 1 (Quadratic features in R2) Consider the map :R2 R3 with (x1, x2) =

    `x21,

    2x1x2, x

    22

    . It is understood

    that the subscripts in this case refer to the components of x R2.Training a linear SV machine on the preprocessed features wouldyield a quadratic function.

    While this approach seems reasonable in the particular ex-ample above, it can easily become computationally infeasiblefor both polynomial features of higher order and higher di-mensionality, as the number of different monomial features ofdegree p is

    `d+p1

    p

    , where d = dim(X). Typical values for

    OCR tasks (with good performance) [Scholkopf et al., 1995,Scholkopf et al., 1997, Vapnik, 1995] are p = 7, d = 28 28 =784, corresponding to approximately 3.7 1016 features.

    2.2 Implicit Mapping via Kernels

    Clearly this approach is not feasible and we have to nd acomputationally cheaper way. The key observation [Boseret al., 1992] is that for the feature map of example 1 we haveD

    x21,2x1x2, x

    22

    ,x21,

    2x1x2, x

    22

    E= x, x2. (17)

    As noted in the previous section, the SV algorithm only de-pends on dot products between patterns xi. Hence it suf-ces to know k(x, x) := (x),(x) rather than explicitlywhich allows us to restate the SV optimization problem:

    maximize

    8>>>: 12

    Pi,j=1

    (i i )(j j )k(xi, xj)

    P

    i=1(i +

    i ) +

    Pi=1

    yi(i i )

    subject toP

    i=1(i i ) = 0 and i,i [0, C]

    (18)

    Likewise the expansion of f (11) may be written as

    w =X

    i=1

    (i i )(xi) and f(x) =X

    i=1

    (i i )k(xi, x) + b.(19)

    The difference to the linear case is that w is no longer givenexplicitly. Also note that in the nonlinear setting, the opti-mization problem corresponds to nding the attest functionin feature space, not in input space.

    2.3 Conditions for Kernels

    The question that arises now is, which functions k(x, x) corre-spond to a dot product in some feature space F. The followingtheorem characterizes these functions (dened on X).

    Theorem 2 (Mercer [1909]) Suppose k L(X2) such that theintegral operator Tk : L2(X) L2(X),

    Tkf() :=ZX

    k(, x)f(x)d(x) (20)

    is positive (here denotes a measure on X with (X) nite andsupp() = X). Let j L2(X) be the eigenfunction of Tk as-sociated with the eigenvalue j = 0 and normalized such thatjL2 = 1 and let j denote its complex conjugate. Then1. (j(T ))j 1.2. j L(X) and supj jL

  • 3. k(x, x) =PjN

    jj(x)j(x) holds for almost all (x, x),

    where the series converges absolutely and uniformly for almostall (x, x).

    Less formally speaking this theorem means that ifZXX

    k(x, x)f(x)f(x)dxdx 0 for all f L2(X) (21)

    holds we can write k(x, x) as a dot product in some featurespace. From this conditionwe can conclude some simple rulesfor compositions of kernels, which then also satisfy Mercerscondition [Scholkopf et al., 1999a]. In the following we willcall such functions k admissible SV kernels.

    Corollary 3 (Positive Linear Combinations of Kernels)Denote by k1, k2 admissible SV kernels and c1, c2 0 then

    k(x, x) := c1k1(x, x) + c2k2(x, x) (22)

    is an admissible kernel. This follows directly from (21) by virtue ofthe linearity of integrals.

    More generally, one can show that the set of admissible ker-nels forms a convex cone, closed in the topology of pointwiseconvergence Berg et al. [1984].

    Corollary 4 (Integrals of Kernels) Let s(x, x) be a symmetricfunction on X X such that

    k(x, x) :=ZX

    s(x, z)s(x, z)dz (23)

    exists. Then k is an admissible SV kernel.

    This can be shown directly from (21) and (23) by rearrang-ing the order of integration. We now state a necessaryand sufcient condition for translation invariant kernels, i.e.k(x, x) := k(x x) as derived in [Smola et al., 1998c].

    Theorem 5 (Products of Kernels) Denote by k1 and k2 admissi-ble SV kernels then

    k(x, x) := k1(x, x)k2(x, x) (24)

    is an admissible kernel.

    This can be seen by an application of the expan-sion part of Mercers theorem to the kernels k1 andk2 and observing that each term in the double sumP

    i,j 1i

    2j

    1i (x)

    1i (x

    )2j (x)2j (x

    ) gives rise to a positive co-efcient when checking (21).

    Theorem 6 (Smola, Scholkopf, and Muller [1998c]) A trans-lation invariant kernel k(x, x) = k(x x) is an admissible SVkernels if and only if the Fourier transform

    F [k]() = (2)d2

    ZX

    ei,xk(x)dx (25)

    is nonnegative.

    We will give a proof and some additional explanations to thistheorem in section 7. It follows from interpolation theory[Micchelli, 1986] and the theory of regularization networks[Girosi et al., 1993]. For kernels of the dotproduct type, i.e.k(x, x) = k(x, x), there exist sufcient conditions for beingadmissible.

    Theorem 7 (Burges [1999]) Any kernel of dotproduct typek(x, x) = k(x, x) has to satisfy

    k() 0, k() 0 and k() + 2k() 0 (26)

    for any 0 in order to be an admissible SV kernel.

    Note that the conditions in theorem 7 are only necessary butnot sufcient. The rules stated above can be useful tools forpractitioners both for checking whether a kernel is an admis-sible SV kernel and for actually constructing new kernels. Thegeneral case is given by the following theorem.

    Theorem 8 (Schoenberg [1942]) A kernel of dotproduct typek(x, x) = k(x, x) dened on an innite dimensional Hilbertspace, with a power series expansion

    k(t) =Xn=0

    antn (27)

    is admissible if and only if all an 0.

    A slightly weaker condition applies for nite dimensionalspaces. For further details see [Berg et al., 1984, Smola et al.,2001].

    2.4 Examples

    In [Scholkopf et al., 1998b] it has been shown, by explicitlycomputing the mapping, that homogeneous polynomial ker-nels k with p N and

    k(x, x) = x, xp (28)

    are suitable SV kernels (cf. Poggio [1975]). From this observa-tion one can conclude immediately [Boser et al., 1992, Vapnik,1995] that kernels of the type

    k(x, x) =`x, x+ cp (29)

    i.e. inhomogeneous polynomial kernels with p N, c 0are admissible, too: rewrite k as a sum of homogeneous ker-nels and apply corollary 3. Another kernel, that might seemappealing due to its resemblance to Neural Networks is thehyperbolic tangent kernel

    k(x, x) = tanh`+ x, x . (30)

    By applying theorem 8 one can check that this kernel does notactually satisfy Mercers condition [Ovari, 2000]. Curiously,the kernel has been successfully used in practice; cf. Scholkopf[1997] for a discussion of the reasons.

    4

  • Translation invariant kernels k(x, x) = k(x x) are quitewidespread. It was shown in [Aizerman et al., 1964, Micchelli,1986, Boser et al., 1992] that

    k(x, x) = exx2

    22 (31)

    is an admissible SV kernel. Moreover one can show [Smola,1996, Vapnik et al., 1997] that (1X denotes the indicator func-tion on the setX and the convolution operation)

    k(x, x) = B2n+1(x x) with Bk :=kOi=1

    1[ 12 , 12 ](32)

    Bsplines of order 2n+1, dened by the 2n+1 convolution ofthe unit inverval, are also admissible. We shall postpone fur-ther considerations to section 7 where the connection to regu-larization operators will be pointed out in more detail.

    3 Cost Functions

    So far the SV algorithm for regression may seem ratherstrange and hardly related to other existing methods of func-tion estimation (e.g. [Huber, 1981, Stone, 1985, Hardle, 1990,Hastie and Tibshirani, 1990, Wahba, 1990]). However, oncecast into a more standard mathematical notation, we will ob-serve the connections to previous work. For the sake of sim-plicity we will, again, only consider the linear case, as exten-sions to the nonlinear one are straightforward by using thekernel method described in the previous chapter.

    3.1 The Risk Functional

    Let us for a moment go back to the case of section 1.2. There,we had some training dataX := {(x1, y1), . . . , (x, y)} XR. We will assume now, that this training set has been drawniid (independent and identically distributed) from some prob-ability distribution P (x, y). Our goal will be to nd a functionf minimizing the expected risk (cf. [Vapnik, 1982])

    R[f ] =

    Zc(x, y, f(x))dP (x, y) (33)

    (c(x, y, f(x)) denotes a cost function determining howwewillpenalize estimation errors) based on the empirical data X.Given that we do not know the distribution P (x, y) we canonly useX for estimating a function f that minimizesR[f ]. Apossible approximation consists in replacing the integrationby the empirical estimate, to get the so called empirical riskfunctional

    Remp[f ] :=1

    Xi=1

    c(xi, yi, f(xi)). (34)

    A rst attempt would be to nd the empirical risk minimizerf0 := argminfH Remp[f ] for some function class H . How-ever, ifH is very rich, i.e. its capacity is very high, as for in-stance when dealing with few data in very high-dimensionalspaces, this may not be a good idea, as it will lead to over-tting and thus bad generalization properties. Hence oneshould add a capacity control term, in the SV case w2, which

    leads to the regularized risk functional [Tikhonov and Ars-enin, 1977, Morozov, 1984, Vapnik, 1982]

    Rreg[f ] := Remp[f ] +2w2 (35)

    where > 0 is a so called regularization constant. Many algo-rithms like regularization networks [Girosi et al., 1993] or neu-ral networks with weight decay networks [e.g. Bishop, 1995]minimize an expression similar to (35).

    3.2 Maximum Likelihood and Density Models

    The standard setting in the SV case is, as already mentionedin section 1.2, the -insensitive loss

    c(x, y, f(x)) = |y f(x)|. (36)It is straightforward to show that minimizing (35) with theparticular loss function of (36) is equivalent to minimizing (3),the only difference being that C = 1/().Loss functions such like |y f(x)|p with p > 1 may not

    be desirable, as the superlinear increase leads to a loss of therobustness properties of the estimator [Huber, 1981]: in thosecases the derivative of the cost function growswithout bound.For p < 1, on the other hand, c becomes nonconvex.For the case of c(x, y, f(x)) = (y f(x))2 we recover the

    least mean squares t approach, which, unlike the standardSV loss function, leads to a matrix inversion instead of aquadratic programming problem.The question is which cost function should be used in (35).

    On the one hand we will want to avoid a very complicatedfunction c as this may lead to difcult optimization problems.On the other hand one should use that particular cost functionthat suits the problem best. Moreover, under the assumptionthat the samples were generated by an underlying functionaldependency plus additive noise, i.e. yi = ftrue(xi) + i withdensity p(), then the optimal cost function in a maximumlikelihood sense is

    c(x, y, f(x)) = log p(y f(x)). (37)This can be seen as follows. The likelihood of an estimate

    Xf := {(x1, f(x1)), . . . , (x, f(x))} (38)for additive noise and iid data is

    p(Xf |X) =Y

    i=1

    p(f(xi)|(xi, yi)) =Y

    i=1

    p(yi f(xi)). (39)

    Maximizing P (Xf |X) is equivalent to minimizing logP (Xf |X). By using (37) we get

    logP (Xf |X) =X

    i=1

    c(xi, yi, f(xi)). (40)

    However, the cost function resulting from this reasoningmight be nonconvex. In this case one would have to nd aconvex proxy in order to deal with the situation efciently (i.e.to nd an efcient implementation of the corresponding opti-mization problem).

    5

  • If, on the other hand, we are given a specic cost functionfrom a real world problem, one should try to nd as close aproxy to this cost function as possible, as it is the performancewrt. this particular cost function that matters ultimately.

    Table 1 contains an overview over some common densitymodels and the corresponding loss functions as dened by(37).

    The only requirement we will impose on c(x, y, f(x)) in thefollowing is that for xed x and y we have convexity in f(x).This requirement is made, as we want to ensure the existenceand uniqueness (for strict convexity) of a minimum of opti-mization problems [Fletcher, 1989].

    3.3 Solving the Equations

    For the sake of simplicity we will additionally assume c to besymmetric and to have (at most) two (for symmetry) discon-tinuities at , 0 in the rst derivative, and to be zero inthe interval [, ]. All loss functions from table 1 belong tothis class. Hence c will take on the following form.

    c(x, y, f(x)) = c(|y f(x)|) (41)

    Note the similarity to Vapniks insensitive loss. It is ratherstraightforward to extend this special choice to more generalconvex cost functions. For nonzero cost functions in the in-terval [, ] use an additional pair of slack variables. More-over we might choose different cost functions ci, ci and dif-ferent values of i, i for each sample. At the expense of ad-ditional Lagrange multipliers in the dual formulation addi-tional discontinuities also can be taken care of. Analogouslyto (3) we arrive at a convex minimization problem [Smola andScholkopf, 1998a]. To simplify notation we will stick to theone of (3) and use C instead of normalizing by and .

    minimize 12w2 + CP

    i=1(c(i) + c(

    i ))

    subject to

    8>>>>>>>>>>:

    12P

    i,j=1(i i )(j j )xi, xj

    +P

    i=1yi(i i ) (i + i )

    +CP

    i=1T (i) + T (

    i )

    where

    8>>>>>>:P

    i=1(i i ) = 0

    C c() = inf{ |C c }, 0

    (43)

    3.4 Examples

    Let us consider the examples of table 1. We will show ex-plicitly for two examples how (43) can be further simpliedto bring it into a form that is practically useful. In the insensitive case, i.e. c() = || we get

    T () = 1 = 0. (44)

    Morover one can conclude from c() = 1 that

    = inf{ |C } = 0 and [0, C] . (45)

    For the case of piecewise polynomial loss we have to distin-guish two different cases: and > . In the rst casewe get

    T () =1

    pp1p 1

    p1p = p 1

    p1pp (46)

    and = inf{ |C1pp1 } = C 1p1 1p1 and thus

    T () = p 1p

    Cp

    p1p

    p1 . (47)

    In the second case ( ) we have

    T () = p 1p

    = p 1p

    (48)

    and = inf{ |C } = , which, in turn yields [0, C].Combining both cases we have

    [0, C] and T () = p 1p

    Cp

    p1p

    p1 . (49)

    Table 2 contains a summary of the various conditions on andformulas for T () (strictly speaking T (())) for different costfunctions.5 Note that the maximum slope of c determines theregion of feasibility of , i.e. s := supR+ c() < leadsto compact intervals [0, Cs] for . This means that the inu-ence of a single pattern is bounded, leading to robust estima-tors [Huber, 1972]. One can also observe experimentally that

    6

  • loss function density modelinsensitive c() = || p() = 12(1+) exp(||)Laplacian c() = || p() = 12 exp(||)Gaussian c() = 12

    2 p() = 12

    exp( 22 )Hubers robust loss c() =

    12 ()

    2 if || || 2 otherwise

    p()

    exp( 22 ) if || exp(2 ||) otherwise

    Polynomial c() = 1p ||p p() = p2(1/p) exp(||p)

    Piecewise polynomial c() =

    (1

    pp1 ()p if ||

    || p1p otherwisep()

    (exp( p

    pp1 ) if || exp( p1p ||) otherwise

    Table 1: Common loss functions and corresponding density models

    CT ()insensitive = 0 [0, C] 0Laplacian = 0 [0, C] 0Gaussian = 0 [0,) 12C12Hubersrobust loss = 0 [0, C]

    12C

    12

    Polynomial = 0 [0,) p1p C1

    p1p

    p1

    Piecewisepolynomial = 0 [0, C]

    p1p C

    1p1p

    p1

    Table 2: Terms of the convex optimization problemdependingon the choice of the loss function.

    the performance of a SV machine depends signicantly on thecost function used [Muller et al., 1997, Smola et al., 1998b].A cautionary remark is necessary regarding the use of cost

    functions other than the insensitive one. Unless = 0 wewill lose the advantage of a sparse decomposition. This maybe acceptable in the case of few data, but will render the pre-diction step extremely slow otherwise. Hence one will haveto trade off a potential loss in prediction accuracy with fasterpredictions. Note, however, that also a reduced set algorithmlike in [Burges, 1996, Burges and Scholkopf, 1997, Scholkopfet al., 1999b] or sparse decomposition techniques [Smola andScholkopf, 2000] could be applied to address this issue. In aBayesian setting, Tipping [2000] has recently shown how anL2 cost function can be used without sacricing sparsity.

    4 The Bigger Picture

    Before delving into algorithmic details of the implementationlet us briey review the basic properties of the SV algorithmfor regression as described so far. Figure 2 contains a graphi-cal overview over the different steps in the regression stage.The input pattern (for which a prediction is to be made)

    is mapped into feature space by a map . Then dot prod-ucts are computed with the images of the training patternsunder the map . This corresponds to evaluating kernel func-tions k(xi, x). Finally the dot products are added up usingthe weights i = ii . This, plus the constant term b yieldsthe nal prediction output. The process described here is very

    5The table displaysCT () instead of T () since the former can be pluggeddirectly into the corresponding optimization equations.

    output i k (x,xi) + b

    weights l

    test vector x

    support vectors x1 ... xl

    mapped vectors (xi), (x) (x) (xl)

    dot product ( (x). (xi)) = k (x,xi)( . ) ( . ) ( . )

    (x1) (x2)

    Figure 2: Architecture of a regressionmachine constructed bythe SV algorithm.

    similar to regression in a neural network, with the difference,that in the SV case the weights in the input layer are a subsetof the training patterns.Figure 3 demonstrates how the SV algorithm chooses the

    attest function among those approximating the original datawith a given precision. Although requiring atness only infeature space, one can observe that the functions also are veryat in input space. This is due to the fact, that kernels can beassociated with atness properties via regularization opera-tors. This will be explained in more detail in section 7.Finally Fig. 4 shows the relation between approximation

    quality and sparsity of representation in the SV case. Thelower the precision required for approximating the originaldata, the fewer SVs are needed to encode that. The non-SVsare redundant, i.e. even without these patterns in the trainingset, the SV machine would have constructed exactly the samefunction f . One might think that this could be an efcientway of data compression, namely by storing only the sup-port patterns, from which the estimate can be reconstructedcompletely. However, this simple analogy turns out to fail inthe case of high-dimensional data, and even more drasticallyin the presence of noise. In [Vapnik et al., 1997] one can seethat even for moderate approximation quality, the number ofSVs can be considerably high, yielding rates worse than theNyquist rate [Nyquist, 1928, Shannon, 1948].

    7

  • -1

    -0.8

    -0.6

    -0.4

    -0.2

    0

    0.2

    0.4

    0.6

    0.8

    1

    -1 -0.8 -0.6 -0.4 -0.2 0 0.2 0.4 0.6 0.8 1

    sinc x + 0.1sinc x - 0.1

    approximation

    -1

    -0.8

    -0.6

    -0.4

    -0.2

    0

    0.2

    0.4

    0.6

    0.8

    1

    -1 -0.8 -0.6 -0.4 -0.2 0 0.2 0.4 0.6 0.8 1

    sinc x + 0.2sinc x - 0.2

    approximation

    -1

    -0.8

    -0.6

    -0.4

    -0.2

    0

    0.2

    0.4

    0.6

    0.8

    1

    -1 -0.8 -0.6 -0.4 -0.2 0 0.2 0.4 0.6 0.8 1

    sinc x + 0.5sinc x - 0.5

    approximation

    Figure 3: Left to right: approximation of the function sinc xwith precisions = 0.1, 0.2, and 0.5. The solid top and the bottomlines indicate the size of the tube, the dotted line in between is the regression.

    Figure 4: Left to right: regression (solid line), datapoints (small dots) and SVs (big dots) for an approximationwith = 0.1, 0.2,and 0.5. Note the decrease in the number of SVs.

    5 Optimization Algorithms

    While there has been a large number of implementations ofSV algorithms in the past years, we focus on a few algorithmswhich will be presented in greater detail. This selection issomewhat biased, as it contains these algorithms the authorsare most familiar with. However, we think that this overviewcontains some of the most effective ones and will be useful forpractitioners who would like to actually code a SV machineby themselves. But before doing so we will briey cover ma-jor optimization packages and strategies.

    5.1 Implementations

    Most commercially available packages for quadratic program-ming can also be used to train SVmachines. These are usuallynumerically very stable general purpose codes, with specialenhancements for large sparse systems. While the latter is afeature that is not needed at all in SV problems (there the dotproduct matrix is dense and huge) they still can be used withgood success.6

    OSL This package was written by [IBM Corporation, 1992].It uses a two phase algorithm. The rst step consists ofsolving a linear approximation of the QP problem by thesimplex algorithm [Dantzig, 1962]. Next a related verysimple QP problem is dealt with. When successive ap-

    6The high price tag usually is the major deterrent for not using them. More-over one has to bear in mind that in SV regression, one may speed up the so-lution considerably by exploiting the fact that the quadratic form has a specialstructure or that there may exist rank degeneracies in the kernel matrix itself.

    proximations are close enough together, the second sub-algorithm, which permits a quadratic objective and con-verges very rapidly from a good starting value, is used.Recently an interior point algorithm was added to thesoftware suite.

    CPLEX by CPLEX Optimization Inc. [1994] uses a primal-dual logarithmic barrier algorithm [Megiddo, 1989] in-stead with predictor-corrector step (see eg. [Lustig et al.,1992, Mehrotra and Sun, 1992]).

    MINOS by the Stanford Optimization Laboratory [Murtaghand Saunders, 1983] uses a reduced gradient algorithmin conjunction with a quasi-Newton algorithm. The con-straints are handled by an active set strategy. Feasibilityis maintained throughout the process. On the active con-straint manifold, a quasiNewton approximation is used.

    MATLAB Until recently the matlab QP optimizer deliveredonly agreeable, although below average performance onclassication tasks and was not all too useful for regres-sion tasks (for problems much larger than 100 samples)due to the fact that one is effectively dealing with an op-timization problem of size 2 where at least half of theeigenvalues of the Hessian vanish. These problems seemto have been addressed in version 5.3 / R11. Matlab nowuses interior point codes.

    LOQO by Vanderbei [1994] is another example of an interiorpoint code. Section 5.3 discusses the underlying strate-gies in detail and shows how they can be adapted to SValgorithms.

    8

  • MaximumMargin Perceptron by Kowalczyk [2000] is an al-gorithm specically tailored to SVs. Unlike most othertechniques it works directly in primal space and thus doesnot have to take the equality constraint on the Lagrangemultipliers into account explicitly.

    Iterative Free Set Methods The algorithm by Kaufman[Bunch et al., 1976, Bunch and Kaufman, 1977, 1980,Drucker et al., 1997, Kaufman, 1999], uses such a tech-nique starting with all variables on the boundary andadding them as the Karush Kuhn Tucker conditionsbecome more violated. This approach has the advantageof not having to compute the full dot product matrixfrom the beginning. Instead it is evaluated on the y,yielding a performance improvement in comparisonto tackling the whole optimization problem at once.However, also other algorithms can be modied bysubset selection techniques (see section 5.5) to addressthis problem.

    5.2 Basic Notions

    Most algorithms rely on results from the duality theory in con-vex optimization. Although we already happened to mentionsome basic ideas in section 1.2 we will, for the sake of conve-nience, briey review without proof the core results. Theseare needed in particular to derive an interior point algorithm.For details and proofs see e.g. [Fletcher, 1989].

    Uniqueness Every convex constrained optimization problemhas a unique minimum. If the problem is strictly convexthen the solution is unique. This means that SVs are notplagued with the problem of local minima as Neural Net-works are.7

    Lagrange Function The Lagrange function is given by theprimal objective function minus the sum of all productsbetween constraints and corresponding Lagrange mul-tipliers (cf. e.g. [Fletcher, 1989, Bertsekas, 1995]). Opti-mization can be seen as minimzation of the Lagrangianwrt. the primal variables and simultaneousmaximizationwrt. the Lagrange multipliers, i.e. dual variables. It has asaddle point at the solution. Usually the Lagrange func-tion is only a theoretical device to derive the dual objec-tive function (cf. Sec. 1.2).

    Dual Objective Function It is derived by minimizing the La-grange function with respect to the primal variables andsubsequent elimination of the latter. Hence it can be writ-ten solely in terms of the dual variables.

    Duality Gap For both feasible primal and dual variablesthe primal objective function (of a convex minimizationproblem) is always greater or equal than the dual objec-tive function. Since SVMs have only linear constraints

    7For large and noisy problems (e.g. 100.000 patterns and more with a sub-stantial fraction of nonbound Lagrange multipliers) it is impossible to solve theproblem exactly: due to the size one has to use subset selection algorithms,hence joint optimization over the training set is impossible. However, unlike inNeural Networks, we can determine the closeness to the optimum. Note thatthis reasoning only holds for convex cost functions.

    the constraint qualications of the strong duality theo-rem [Bazaraa et al., 1993, Theorem 6.2.4] are satised andit follows that gap vanishes at optimality. Thus the dual-ity gap is a measure how close (in terms of the objectivefunction) the current set of variables is to the solution.

    KarushKuhnTucker (KKT) conditions A set of primal anddual variables that is both feasible and satises the KKTconditions is the solution (i.e. constraint dual variable=0). The sum of the violated KKT terms determines exactlythe size of the duality gap (that is, we simply compute theconstraint Lagrangemultiplier part as done in (55)). Thisallows us to compute the latter quite easily.

    A simple intuition is that for violated constraints the dualvariable could be increased arbitrarily, thus rendering theLagrange function arbitrarily large. This, however, is incontradition to the saddlepoint property.

    5.3 Interior Point Algorithms

    In a nutshell the idea of an interior point algorithm is to com-pute the dual of the optimization problem (in our case thedual dual of Rreg[f ]) and solve both primal and dual simul-taneously. This is done by only gradually enforcing the KKTconditions to iteratively nd a feasible solution and to usethe duality gap between primal and dual objective functionto determine the quality of the current set of variables. Thespecial avour of algorithm we will describe is primaldualpathfollowing [Vanderbei, 1994].In order to avoid tedious notation we will consider the

    slightly more general problem and specialize the result to theSVM later. It is understood that unless stated otherwise, vari-ables like denote vectors and i denotes its ith component.

    minimize 12 q() + c,subject to A = b and l u (50)

    with c,, l, u Rn, A Rnm, b Rm, the inequalities be-tween vectors holding componentwise and q() being a con-vex function of . Now we will add slack variables to get ridof all inequalities but the positivity constraints. This yields:

    minimize 12q() + c,subject to A = b, g = l, + t = u,

    g, t 0, free(51)

    The dual of (51) is

    maximize 12 (q() q(),)+ b, y+ l, z u, ssubject to 12 q() + c (Ay) + s = z, s, z 0, y free

    (52)Moreover we get the KKT conditions, namely

    gizi = 0 and siti = 0 for all i [1 . . . n]. (53)A necessary and sufcient condition for the optimal solutionis that the primal / dual variables satisfy both the feasibil-ity conditions of (51) and (52) and the KKT conditions (53).We proceed to solve (51) (53) iteratively. The details can befound in appendix A.

    9

  • 5.4 Useful Tricks

    Before proceeding to further algorithms for quadratic opti-mization let us briey mention some useful tricks that canbe applied to all algorithms described subsequently and mayhave signicant impact despite their simplicity. They are inpart derived from ideas of the interior-point approach.

    Training with Different Regularization Parameters For sev-eral reasons (model selection, controlling the number ofsupport vectors, etc.) it may happen that one has to traina SVmachine with different regularization parametersC,but otherwise rather identical settings. If the parametersCnew = Cold is not too different it is advantageous touse the rescaled values of the Lagrange multipliers (i.e.i,

    i ) as a starting point for the new optimization prob-

    lem. Rescaling is necessary to satisfy the modied con-straints. One gets

    new = old and likewise bnew = bold. (54)

    Assuming that the (dominant) convex part q() of theprimal objective is quadratic, the q scales with 2 whereas the linear part scales with . However, since the lin-ear term dominates the objective function, the rescaledvalues are still a better starting point than = 0. Inpractice a speedup of approximately 95% of the overalltraining time can be observed when using the sequen-tial minimization algorithm, cf. [Smola, 1998]. A similarreasoning can be applied when retraining with the sameregularization parameter but different (yet similar) widthparameters of the kernel function. See [Cristianini et al.,1998] for details thereon in a different context.

    Monitoring Convergence via the Feasibility Gap In thecase of both primal and dual feasible variables thefollowing connection between primal and dual objectivefunction holds:

    Dual Obj. = Primal Obj.Xi

    (gizi + siti) (55)

    This can be seen immediately by the construction of theLagrange function. In Regression Estimation (with theinsensitive loss function) one obtains for

    Pi gizi + siti

    Xi

    2664+max(0, f(xi) (yi + i))(C i )min(0, f(xi) (yi + i))i+max(0, (yi i ) f(xi))(C i)min(0, (yi i ) f(xi))i

    3775 . (56)Thus convergence with respect to the point of the solu-tion can be expressed in terms of the duality gap. Aneffective stopping rule is to requireP

    i gizi + siti|Primal Objective|+ 1 tol (57)

    for some precision tol. This condition is much in thespirit of primal dual interior point path following algo-rithms, where convergence is measured in terms of thenumber of signicant gures (which would be the dec-imal logarithm of (57)), a convention that will also beadopted in the subsequent parts of this exposition.

    5.5 Subset Selection Algorithms

    The convex programming algorithms described so far canbe used directly on moderately sized (up to 3000) samplesdatasets without any further modications. On large datasets,however, it is difcult, due to memory and cpu limitations,to compute the dot product matrix k(xi, xj) and keep it inmemory. A simple calculation shows that for instance stor-ing the dot product matrix of the NIST OCR database (60.000samples) at single precision would consume 0.7 GBytes. ACholesky decomposition thereof, which would additionallyrequire roughly the same amount of memory and 64 Teraops(counting multiplies and adds separately), seems unrealistic,at least at current processor speeds.A rst solution, which was introduced in [Vapnik, 1982] re-

    lies on the observation that the solution can be reconstructedfrom the SVs alone. Hence, if we knew the SV set beforehand,and it tted into memory, then we could directly solve the re-duced problem. The catch is that we do not know the SV setbefore solving the problem. The solution is to start with anarbitrary subset, a rst chunk that ts into memory, train theSV algorithm on it, keep the SVs and ll the chunk up withdata the current estimator would make errors on (i.e. data ly-ing outside the tube of the current regression). Then retrainthe system and keep on iterating until after training allKKT conditions are satised.The basic chunking algorithm just postponed the underly-

    ing problemof dealingwith large datasets whose dotproductmatrix cannot be kept in memory: it will occur for larger train-ing set sizes than originally, but it is not completely avoided.Hence the solution is [Osuna et al., 1997] to use only a subsetof the variables as a working set and optimize the problemwith respect to them while freezing the other variables. Thismethod is described in detail in [Osuna et al., 1997, Joachims,1999, Saunders et al., 1998] for the case of pattern recognition.8

    An adaptation of these techniques to the case of regressionwith convex cost functions can be found in appendix B. Thebasic structure of the method is described by algorithm 1.

    Algorithm 1 Basic structure of a working set algorithm.Initialize i,i = 0Choose arbitrary working set SwrepeatCompute coupling terms (linear and constant) forSw (seeAppendix B)Solve reduced optimization problemChoose new Sw from variables i,i not satisfying theKKT conditions

    untilworking set Sw =

    5.6 SequentialMinimal Optimization

    Recently an algorithm Sequential Minimal Optimization(SMO)was proposed [Platt, 1999] that puts chunking to the

    8A similar technique was employed by Bradley and Mangasarian [1998] inthe context of linear programming in order to deal with large datasets.

    10

  • extreme by iteratively selecting subsets only of size 2 and op-timizing the target function with respect to them. It has beenreported to have good convergence properties and it is easilyimplemented. The key point is that for a working set of 2 theoptimization subproblem can be solved analytically withoutexplicitly invoking a quadratic optimizer.While readily derived for pattern recognition by Platt

    [1999], one simply has to mimick the original reasoning toobtain an extension to Regression Estimation. This is whatwill be done in Appendix C (the pseudocode can be found in[Smola and Scholkopf, 1998b]). The modications consist ofa pattern dependent regularization, convergence control viathe number of signicant gures, and a modied system ofequations to solve the optimization problem in two variablesfor regression analytically.Note that the reasoning only applies to SV regression with

    the insensitive loss function for most other convex costfunctions an explicit solution of the restricted quadratic pro-gramming problem is impossible. Yet, one could derive ananalogous non-quadratic convex optimization problem forgeneral cost functions but at the expense of having to solveit numerically.The exposition proceeds as follows: rst one has to derive

    the (modied) boundary conditions for the constrained 2 in-dices (i, j) subproblem in regression, next one can proceed tosolve the optimization problem analytically, and nally onehas to check, which part of the selection rules have to be mod-ied to make the approach work for regression. Since most ofthe content is fairly technical it has been relegated to appendixC.The main difference in implementations of SMO for regres-

    sion can be found in the way the constant offset b is deter-mined [Keerthi et al., 1999] and which criterion is used to se-lect a new set of variables. We present one such strategy inappendix C.3. However, since selection strategies are the fo-cus of current research we recommend that readers interestedin implementing the algorithm make sure they are aware ofthe most recent developments in this area.Finally, we note that just as we presently describe a general-

    ization of SMO to regression estimation, other learning prob-lems can also benet from the underlying ideas. Recently,a SMO algorithm for training novelty detection systems (i.e.one-class classication) has been proposed [Scholkopf et al.,2001].

    6 Variations on a Theme

    There exists a large number of algorithmic modications ofthe SV algorithm, to make it suitable for specic settings (in-verse problems, semiparametric settings), different ways ofmeasuring capacity and reductions to linear programming(convex combinations) and different ways of controlling ca-pacity. We will mention some of the more popular ones.

    6.1 Convex Combinations and 1norms

    All the algorithms presented so far involved convex, and atbest, quadratic programming. Yet onemight think of reducing

    the problem to a case where linear programming techniquescan be applied. This can be done in a straightforward fash-ion [Mangasarian, 1965, 1968, Weston et al., 1999, Smola et al.,1999] for both SV pattern recognition and regression. The keyis to replace (35) by

    Rreg[f ] := Remp[f ] + 1 (58)where 1 denotes the 1 norm in coefcient space. Henceone uses the SV kernel expansion (11)

    f(x) =X

    i=1

    ik(xi, x) + b

    with a different way of controlling capacity by minimizing

    Rreg[f ] =1

    Xi=1

    c(xi, yi, f(xi)) + X

    i=1

    |i|. (59)

    For the insensitive loss function this leads to a linear pro-gramming problem. In the other cases, however, the problemstill stays a quadratic or general convex one, and thereforemay not yield the desired computational advantage. There-fore we will limit ourselves to the derivation of the linear pro-gramming problem in the case of | | cost function. Reformu-lating (59) yields

    minimizeP

    i=1(i +

    i ) + C

    Pi=1

    (i + i )

    subject to

    8>>>>>>>:yi

    Pj=1

    (j j )k(xj , xi) b + iP

    j=1(j j )k(xj , xi) + b yi + i

    i,i , i,

    i 0

    Unlike in the classical SV case, the transformation into itsdual does not give any improvement in the structure of theoptimization problem. Hence it is best to minimize Rreg[f ]directly, which can be achieved by a linear optimizer, e.g.[Dantzig, 1962, Lustig et al., 1990, Vanderbei, 1997].In [Weston et al., 1999] a similar variant of the linear SV ap-

    proach is used to estimate densities on a line. One can show[Smola et al., 2000] that one may obtain bounds on the gen-eralization error which exhibit even better rates (in terms ofthe entropy numbers) than the classical SV case [Williamsonet al., 1998].

    6.2 Automatic Tuning of the Insensitivity Tube

    Besides standard model selection issues, i.e. how to spec-ify the trade-off between empirical error and model capacitythere also exists the problem of an optimal choice of a costfunction. In particular, for the -insensitive cost function westill have the problem of choosing an adequate parameter inorder to achieve good performance with the SV machine.Smola et al. [1998a] show the existence of a linear depen-

    dency between the noise level and the optimal -parameterfor SV regression. However, this would require that we knowsomething about the noise model. This knowledge is not


available in general. Therefore, albeit providing theoretical insight, this finding by itself is not particularly useful in practice. Moreover, if we really knew the noise model, we most likely would not choose the ε-insensitive cost function but the corresponding maximum likelihood loss function instead. There exists, however, a method to construct SV machines

that automatically adjust ε and moreover also, at least asymptotically, have a predetermined fraction of sampling points as SVs [Schölkopf et al., 2000]. We modify (35) such that ε becomes a variable of the optimization problem, including an extra term in the primal objective function which attempts to minimize ε. In other words

minimize   R_ν[f] := R_emp[f] + (λ/2)‖w‖² + νε    (60)

for some ν > 0. Hence (42) becomes (again carrying out the usual transformation between λ, ℓ, and C)

minimize   (1/2)‖w‖² + C ( Σ_{i=1}^ℓ (c(ξ_i) + c(ξ_i*)) + ℓνε )    (61)

subject to
  y_i − ⟨w, x_i⟩ − b ≤ ε + ξ_i
  ⟨w, x_i⟩ + b − y_i ≤ ε + ξ_i*
  ξ_i, ξ_i* ≥ 0

where, for the ε-insensitive loss function, the corresponding dual is

maximize   −(1/2) Σ_{i,j=1}^ℓ (α_i − α_i*)(α_j − α_j*) k(x_i, x_j) + Σ_{i=1}^ℓ y_i (α_i − α_i*)

subject to
  Σ_{i=1}^ℓ (α_i − α_i*) = 0
  Σ_{i=1}^ℓ (α_i + α_i*) ≤ Cνℓ
  α_i, α_i* ∈ [0, C]    (62)

Note that the optimization problem is thus very similar to the ε-SV one: the target function is even simpler (it is homogeneous), but there is an additional constraint. For information on how this affects the implementation, cf. [Chang and Lin, 2001].

Besides having the advantage of being able to automatically determine ε, (62) also has another advantage. It can be used to prespecify the number of SVs:

Theorem 9 (Schölkopf et al. [2000])

1. ν is an upper bound on the fraction of errors.

2. ν is a lower bound on the fraction of SVs.

3. Suppose the data has been generated iid from a distribution p(x, y) = p(x)p(y|x) with a continuous conditional distribution p(y|x). With probability 1, asymptotically, ν equals the fraction of SVs and the fraction of errors.
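The fraction-of-SVs property of Theorem 9 is easy to observe empirically. The short sketch below uses scikit-learn's NuSVR, which implements ν-SV regression; the dataset, kernel width, and the value ν = 0.2 are arbitrary illustrative choices.

```python
import numpy as np
from sklearn.svm import NuSVR

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(200, 1))
y = np.sinc(X[:, 0]) + 0.1 * rng.normal(size=200)

nu = 0.2
model = NuSVR(nu=nu, C=1.0, kernel="rbf", gamma=1.0).fit(X, y)

frac_sv = len(model.support_) / len(y)
print(f"nu = {nu}, fraction of support vectors = {frac_sv:.2f}")
# By Theorem 9, nu lower-bounds the fraction of SVs (and upper-bounds the
# fraction of points lying outside the automatically chosen tube).
```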

Essentially, ν-SV regression improves upon ε-SV regression by allowing the tube width to adapt automatically to the data. What is kept fixed up to this point, however, is the shape of the tube. One can, however, go one step further and use parametric tube models with non-constant width, leading to almost identical optimization problems [Schölkopf et al., 2000].

Combining ν-SV regression with results on the asymptotically optimal choice of ε for a given noise model [Smola et al., 1998a] leads to a guideline how to adjust ν provided the class of noise models (e.g. Gaussian or Laplacian) is known.

Remark 10 (Optimal Choice of ν) Denote by p a probability density with unit variance, and by P a family of noise models generated from p by rescaling, P := { (1/σ) p(y/σ) : σ > 0 }. Moreover assume that the data were drawn iid from p(x, y) = p(x) p(y − f(x)) with p(y − f(x)) continuous. Then under the assumption of uniform convergence, the asymptotically optimal value of ν is

ν = 1 − ∫_{−ε}^{ε} p(t) dt,
where  ε := argmin_τ (p(−τ) + p(τ))^{−2} ( 1 − ∫_{−τ}^{τ} p(t) dt )    (63)

For polynomial noise models, i.e. densities of type exp(−|ξ|^p), one may compute the corresponding (asymptotically) optimal values of ν. They are given in figure 5. For further details see [Schölkopf et al., 2000, Smola, 1998]; an experimental validation has been given by Chalimourda et al. [2000].

[Figure 5 plots two curves, the optimal ν and the optimal ε for unit variance, against the polynomial degree p.]

Figure 5: Optimal ν and ε for various degrees of polynomial additive noise.
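Equation (63) is also straightforward to evaluate numerically. The sketch below does so for unit-variance Gaussian noise; the use of scipy for the integral and the one-dimensional minimization, as well as the search bounds, are our own choices of tooling.

```python
import numpy as np
from scipy.integrate import quad
from scipy.optimize import minimize_scalar
from scipy.stats import norm

p = norm(loc=0.0, scale=1.0).pdf      # unit-variance Gaussian noise density

def objective(tau):
    # (p(-tau) + p(tau))^(-2) * (1 - integral_{-tau}^{tau} p(t) dt), cf. (63)
    mass, _ = quad(p, -tau, tau)
    return (p(-tau) + p(tau)) ** (-2) * (1.0 - mass)

res = minimize_scalar(objective, bounds=(1e-3, 5.0), method="bounded")
eps_opt = res.x
nu_opt = 1.0 - quad(p, -eps_opt, eps_opt)[0]
print(f"optimal eps = {eps_opt:.3f}, optimal nu = {nu_opt:.3f}")
# For Gaussian noise this comes out at roughly nu = 0.54.
```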

We conclude this section by noting that ν-SV regression is related to the idea of trimmed estimators. One can show that the regression is not influenced if we perturb points lying outside the tube. Thus, the regression is essentially computed by discarding a certain fraction of outliers, specified by ν, and computing the regression estimate from the remaining points [Schölkopf et al., 2000].


7 Regularization

So far we were not concerned about the specific properties of the map into feature space and used it only as a convenient trick to construct nonlinear regression functions. In some cases the map was just given implicitly by the kernel, hence the map itself and many of its properties have been neglected. A deeper understanding of the kernel map would also be useful to choose appropriate kernels for a specific task (e.g. by incorporating prior knowledge [Schölkopf et al., 1998a]). Finally the feature map seems to defy the curse of dimensionality [Bellman, 1961] by making problems seemingly easier yet reliable via a map into some even higher dimensional space.

In this section we focus on the connections between SV methods and previous techniques like Regularization Networks [Girosi et al., 1993].9 In particular we will show that SV machines are essentially Regularization Networks (RN) with a clever choice of cost functions and that the kernels are Green's functions of the corresponding regularization operators. For a full exposition of the subject the reader is referred to [Smola et al., 1998c].

7.1 Regularization Networks

Let us briefly review the basic concepts of RNs. As in (35) we minimize a regularized risk functional. However, rather than enforcing flatness in feature space we try to optimize some smoothness criterion for the function in input space. Thus we get

R_reg[f] := R_emp[f] + (λ/2)‖Pf‖².    (64)

Here P denotes a regularization operator in the sense of [Tikhonov and Arsenin, 1977], i.e. P is a positive semidefinite operator mapping from the Hilbert space H of functions f under consideration to a dot product space D such that the expression ⟨Pf, Pg⟩ is well defined for f, g ∈ H. For instance by choosing a suitable operator that penalizes large variations of f one can reduce the well-known overfitting effect. Another possible setting also might be an operator P mapping from L2(R^n) into some Reproducing Kernel Hilbert Space (RKHS) [Aronszajn, 1950, Kimeldorf and Wahba, 1971, Saitoh, 1988, Schölkopf, 1997, Girosi, 1998].

Using an expansion of f in terms of some symmetric function k(x_i, x_j) (note here, that k need not fulfill Mercer's condition and can be chosen arbitrarily since it is not used to define a regularization term),

f(x) = Σ_{i=1}^ℓ α_i k(x_i, x) + b,    (65)

and the ε-insensitive cost function, this leads to a quadratic programming problem similar to the one for SVs. Using

D_ij := ⟨(Pk)(x_i, ·), (Pk)(x_j, ·)⟩    (66)

9 Due to length constraints we will not deal with the connection between Gaussian Processes and SVMs. See Williams [1998] for an excellent overview.

we get α = D⁻¹K(β − β*), with β, β* being the solution of

minimize   (1/2)(β − β*)ᵀ K D⁻¹ K (β − β*) − (β − β*)ᵀ y + ε Σ_{i=1}^ℓ (β_i + β_i*)

subject to   Σ_{i=1}^ℓ (β_i − β_i*) = 0  and  β_i, β_i* ∈ [0, C].    (67)

Unfortunately, this setting of the problem does not preserve sparsity in terms of the coefficients, as a potentially sparse decomposition in terms of β_i and β_i* is spoiled by D⁻¹K, which is not in general diagonal.

7.2 Green's Functions

Comparing (10) with (67) leads to the question whether and under which condition the two methods might be equivalent and therefore also under which conditions regularization networks might lead to sparse decompositions, i.e. only a few of the expansion coefficients α_i in f would differ from zero. A sufficient condition is D = K and thus KD⁻¹K = K (if K does not have full rank we only need that KD⁻¹K = K holds on the image of K):

k(x_i, x_j) = ⟨(Pk)(x_i, ·), (Pk)(x_j, ·)⟩    (68)

Our goal now is to solve the following two problems:

1. Given a regularization operator P, find a kernel k such that a SV machine using k will not only enforce flatness in feature space, but also correspond to minimizing a regularized risk functional with P as regularizer.

2. Given an SV kernel k, find a regularization operator P such that a SV machine using this kernel can be viewed as a Regularization Network using P.

These two problems can be solved by employing the concept of Green's functions as described in [Girosi et al., 1993]. These functions were introduced for the purpose of solving differential equations. In our context it is sufficient to know that the Green's functions G_{x_i}(x) of P*P satisfy

(P*P G_{x_i})(x) = δ_{x_i}(x).    (69)

Here, δ_{x_i}(x) is the δ-distribution (not to be confused with the Kronecker symbol δ_ij) which has the property that ⟨f, δ_{x_i}⟩ = f(x_i). The relationship between kernels and regularization operators is formalized in the following proposition:

Proposition 11 (Smola, Schölkopf, and Müller [1998b]) Let P be a regularization operator, and G be the Green's function of P*P. Then G is a Mercer kernel such that D = K. SV machines using G minimize the risk functional (64) with P as regularization operator.

In the following we will exploit this relationship in both ways: to compute Green's functions for a given regularization operator P and to infer the regularizer, given a kernel k.


7.3 Translation Invariant Kernels

Let us now more specifically consider regularization operators P that may be written as multiplications in Fourier space

⟨Pf, Pg⟩ = (2π)^{-n/2} ∫_Ω  \overline{f̃(ω)} g̃(ω) / P(ω)  dω    (70)

with f̃(ω) denoting the Fourier transform of f(x), and P(ω) = P(−ω) real valued, nonnegative and converging to 0 for |ω| → ∞, and Ω := supp[P(ω)]. Small values of P(ω) correspond to a strong attenuation of the corresponding frequencies. Hence small values of P(ω) for large ω are desirable, since high frequency components of f̃ correspond to rapid changes in f. P(ω) describes the filter properties of P*P. Note that no attenuation takes place for P(ω) = 0 as these frequencies have been excluded from the integration domain.

For regularization operators defined in Fourier space by (70) one can show by exploiting P(ω) = P(−ω) = \overline{P(ω)} that

G(x_i, x) = (2π)^{-n/2} ∫_{R^n} e^{iω(x_i − x)} P(ω) dω    (71)

is a corresponding Green's function satisfying translational invariance, i.e.

G(x_i, x_j) = G(x_i − x_j)  and  G̃(ω) = P(ω).    (72)

This provides us with an efficient tool for analyzing SV kernels and the types of capacity control they exhibit. In fact the above is a special case of Bochner's theorem [Bochner, 1959] stating that the Fourier transform of a positive measure constitutes a positive Hilbert Schmidt kernel.

Example 12 (Gaussian kernels) Following the exposition of [Yuille and Grzywacz, 1988] as described in [Girosi et al., 1993], one can see that for

‖Pf‖² = ∫ dx Σ_m ( σ^{2m} / (m! 2^m) ) (Ô^m f(x))²    (73)

with Ô^{2m} = Δ^m and Ô^{2m+1} = ∇Δ^m, Δ being the Laplacian and ∇ the Gradient operator, we get Gaussian kernels (31). Moreover, we can provide an equivalent representation of P in terms of its Fourier properties, i.e. P(ω) = e^{−σ²‖ω‖²/2} up to a multiplicative constant.

Training an SV machine with Gaussian RBF kernels [Schölkopf et al., 1997] corresponds to minimizing the specific cost function with a regularization operator of type (73). Recall that (73) means that all derivatives of f are penalized (we have a pseudodifferential operator) to obtain a very smooth estimate. This also explains the good performance of SV machines in this case, as it is by no means obvious that choosing a flat function in some high dimensional space will correspond to a simple function in low dimensional space, as shown in [Smola et al., 1998c] for Dirichlet kernels.

The question that arises now is which kernel to choose. Let us think about two extreme situations.

1. Suppose we already knew the shape of the power spectrum Pow(ω) of the function we would like to estimate. In this case we choose k such that k̃ matches the power spectrum [Smola, 1998].

2. If we happen to know very little about the given data a general smoothness assumption is a reasonable choice. Hence we might want to choose a Gaussian kernel. If computing time is important one might moreover consider kernels with compact support, e.g. using the B_q-spline kernels (cf. (32)). This choice will cause many matrix elements k_ij = k(x_i − x_j) to vanish.

The usual scenario will be in between the two extreme cases and we will have some limited prior knowledge available. For more information on using prior knowledge for choosing kernels see [Schölkopf et al., 1998a].

    7.4 Capacity Control

All the reasoning so far was based on the assumption that there exist ways to determine model parameters like the regularization constant or length scales of rbf-kernels. The model selection issue itself would easily double the length of this review and moreover it is an area of active and rapidly moving research. Therefore we limit ourselves to a presentation of the basic concepts and refer the interested reader to the original publications.

It is important to keep in mind that there exist several fundamentally different approaches such as Minimum Description Length (cf. e.g. [Rissanen, 1978, Li and Vitanyi, 1993]) which is based on the idea that the simplicity of an estimate, and therefore also its plausibility, is based on the information (number of bits) needed to encode it such that it can be reconstructed.

Bayesian estimation, on the other hand, considers the posterior probability of an estimate, given the observations X = {(x_1, y_1), . . . , (x_ℓ, y_ℓ)}, an observation noise model, and a prior probability distribution p(f) over the space of estimates (parameters). It is given by Bayes' Rule p(f|X)p(X) = p(X|f)p(f). Since p(X) does not depend on f, one can maximize p(X|f)p(f) to obtain the so-called MAP estimate.10 As a rule of thumb, to translate regularized risk functionals into Bayesian MAP estimation schemes, all one has to do is to consider exp(−R_reg[f]) = p(f|X). For a more detailed discussion see e.g. [Kimeldorf and Wahba, 1970, MacKay, 1991, Neal, 1996, Rasmussen, 1996, Williams, 1998].

A simple yet powerful way of model selection is cross validation. This is based on the idea that the expectation of the error on a subset of the training sample not used during training is identical to the expected error itself. There exist several strategies such as 10-fold cross-validation, leave-one-out error (ℓ-fold cross-validation), bootstrap and derived algorithms to estimate the cross-validation error itself. See e.g. [Stone, 1974, Wahba, 1980, Efron, 1982, Efron and Tibshirani, 1994, Wahba, 1999, Jaakkola and Haussler, 1999] for further details.
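As an illustration of cross-validation-based model selection for SV regression, the sketch below grid-searches C, the kernel width, and ε with 5-fold cross validation using scikit-learn; the parameter grid and the number of folds are arbitrary choices, not recommendations from the text.

```python
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(200, 1))
y = np.sinc(X[:, 0]) + 0.1 * rng.normal(size=200)

param_grid = {
    "C": [0.1, 1.0, 10.0, 100.0],
    "gamma": [0.1, 1.0, 10.0],      # width of the Gaussian RBF kernel
    "epsilon": [0.01, 0.05, 0.1],   # width of the insensitivity tube
}
search = GridSearchCV(SVR(kernel="rbf"), param_grid, cv=5,
                      scoring="neg_mean_squared_error")
search.fit(X, y)
print("selected parameters:", search.best_params_)
```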

10 Strictly speaking, in Bayesian estimation one is not so much concerned about the maximizer f of p(f|X) but rather about the posterior distribution of f.


Finally, one may also use uniform convergence bounds such as the ones introduced by Vapnik and Chervonenkis [1971]. The basic idea is that one may bound with probability 1 − η (with η > 0) the expected risk R[f] by R_emp[f] + Φ(F, η), where Φ is a confidence term depending on the class of functions F. Several criteria for measuring the capacity of F exist, such as the VC-Dimension which, in pattern recognition problems, is given by the maximum number of points that can be separated by the function class in all possible ways, the Covering Number which is the number of elements from F that are needed to cover F with accuracy of at least ε, Entropy Numbers which are the functional inverse of Covering Numbers, and many more variants thereof. See e.g. [Vapnik, 1982, 1998, Devroye et al., 1996, Williamson et al., 1998, Shawe-Taylor et al., 1998].

    8 Conclusion

Due to the already quite large body of work done in the field of SV research it is impossible to write a tutorial on SV regression which includes all contributions to this field. This also would be quite out of the scope of a tutorial and rather be relegated to textbooks on the matter (see [Schölkopf and Smola, 2002] for a comprehensive overview, [Schölkopf et al., 1999a] for a snapshot of the current state of the art, [Vapnik, 1998] for an overview on statistical learning theory, or [Cristianini and Shawe-Taylor, 2000] for an introductory textbook). Still the authors hope that this work provides a not overly biased view of the state of the art in SV regression research. We deliberately omitted (among others) the following topics.

    8.1 Missing Topics

Mathematical Programming Starting from a completely different perspective, algorithms have been developed that are similar in their ideas to SV machines. A good primer might be [Bradley et al., 1998]. Also see [Mangasarian, 1965, 1969, Street and Mangasarian, 1995]. A comprehensive discussion of connections between mathematical programming and SV machines has been given by Bennett [1999].

Density Estimation with SV machines [Weston et al., 1999, Vapnik, 1999]. There one makes use of the fact that the cumulative distribution function is monotonically increasing, and that its values can be predicted with variable confidence which is adjusted by selecting different values of ε in the loss function.

Dictionaries were originally introduced in the context of wavelets by Chen et al. [1999] to allow for a large class of basis functions to be considered simultaneously, e.g. kernels with different widths. In the standard SV case this is hardly possible except by defining new kernels as linear combinations of differently scaled ones: choosing the regularization operator already determines the kernel completely [Kimeldorf and Wahba, 1971, Cox and O'Sullivan, 1990, Schölkopf et al., 2000]. Hence one has to resort to linear programming [Weston et al., 1999].

Applications The focus of this review was on methods and theory rather than on applications. This was done to limit the size of the exposition. State of the art, or even record performance was reported in [Müller et al., 1997, Drucker et al., 1997, Stitson et al., 1999, Mattera and Haykin, 1999].

In many cases, it may be possible to achieve similar performance with neural network methods, however, only if many parameters are optimally tuned by hand, thus depending largely on the skill of the experimenter. Certainly, SV machines are not a silver bullet. However, as they have only a few critical parameters (e.g. regularization and kernel width), state-of-the-art results can be achieved with relatively little effort.

    8.2 Open Issues

Being a very active field, there still exist a number of open issues that have to be addressed by future research. Now that the algorithmic development seems to have found a more stable stage, one of the most important ones seems to be to find tight error bounds derived from the specific properties of kernel functions. It will be of interest in this context whether SV machines, or similar approaches stemming from a linear programming regularizer, will lead to more satisfactory results.

Moreover some sort of "luckiness" framework [Shawe-Taylor et al., 1998] for multiple model selection parameters, similar to multiple hyperparameters and automatic relevance detection in Bayesian statistics [MacKay, 1991, Bishop, 1995], will have to be devised to make SV machines less dependent on the skill of the experimenter.

It is also worthwhile to exploit the bridge between regularization operators, Gaussian processes and priors (see e.g. [Williams, 1998]) to state Bayesian risk bounds for SV machines in order to compare the predictions with the ones from VC theory. Optimization techniques developed in the context of SV machines could also be used to deal with large datasets in the Gaussian process setting.

Prior knowledge appears to be another important question in SV regression. Whilst invariances could be included in pattern recognition in a principled way via the virtual SV mechanism and restriction of the feature space [Burges and Schölkopf, 1997, Schölkopf et al., 1998a], it is still not clear how (probably) more subtle properties, as required for regression, could be dealt with efficiently.

Reduced set methods should also be considered for speeding up the prediction (and possibly also training) phase for large datasets [Burges and Schölkopf, 1997, Osuna and Girosi, 1999, Schölkopf et al., 1999b, Smola and Schölkopf, 2000]. This topic is of great importance as data mining applications require algorithms that are able to deal with databases that are often at least one order of magnitude larger (1 million samples) than the current practical size for SV regression.

Many more aspects such as more data dependent generalization bounds, efficient training algorithms, automatic kernel selection procedures, and many techniques that already have made their way into the standard neural networks toolkit, will have to be considered in the future.

Readers who are tempted to embark upon a more detailed


exploration of these topics, and to contribute their own ideas to this exciting field, may find it useful to consult the web page www.kernel-machines.org.

Acknowledgements

This work has been supported in part by a grant of the DFG (Ja 379/71, Sm 62/1). The authors thank Peter Bartlett, Chris Burges, Stefan Harmeling, Olvi Mangasarian, Klaus-Robert Müller, Vladimir Vapnik, Jason Weston, Robert Williamson, and Andreas Ziehe for helpful discussions and comments.

    A Solving the Interior-Point Equations

    A.1 Path Following

Rather than trying to satisfy (53) directly we will solve a modified version thereof for some μ > 0 substituted on the rhs in the first place and decrease μ while iterating.

g_i z_i = μ,   s_i t_i = μ   for all i ∈ [1 . . . n].    (74)

Still it is rather difficult to solve the nonlinear system of equations (51), (52), and (74) exactly. However we are not interested in obtaining the exact solution to the approximation (74). Instead, we seek a somewhat more feasible solution for a given μ, then decrease μ and repeat. This can be done by linearizing the above system and solving the resulting equations by a predictor-corrector approach until the duality gap is small enough. The advantage is that we will get approximately equal performance as by trying to solve the quadratic system directly, provided that the terms in Δ² are small enough.

A(α + Δα) = b
α + Δα − g − Δg = l
α + Δα + t + Δt = u
c + (1/2)∂_α q(α) + (1/2)∂²_α q(α) Δα − (A(y + Δy))ᵀ + s + Δs = z + Δz
(g_i + Δg_i)(z_i + Δz_i) = μ
(s_i + Δs_i)(t_i + Δt_i) = μ

Solving for the variables in Δ we get

AΔα = b − Aα =: ρ
Δα − Δg = l − α + g =: ν
Δα + Δt = u − α − t =: τ
(AΔy)ᵀ + Δz − Δs − (1/2)∂²_α q(α) Δα = c − (Ay)ᵀ + s − z + (1/2)∂_α q(α) =: σ
g⁻¹z Δg + Δz = μ g⁻¹ − z − g⁻¹ Δg Δz =: γ_z
t⁻¹s Δt + Δs = μ t⁻¹ − s − t⁻¹ Δt Δs =: γ_s

where g⁻¹ denotes the vector (1/g_1, . . . , 1/g_n), and t⁻¹ analogously. Moreover, g⁻¹z and t⁻¹s denote the vectors generated by the componentwise product of the two vectors. Solving for Δg, Δt, Δz, Δs we get

Δg = z⁻¹g(γ_z − Δz)        Δz = g⁻¹z(ν̂ − Δα)
Δt = s⁻¹t(γ_s − Δs)        Δs = t⁻¹s(Δα − τ̂)

where  ν̂ := ν + z⁻¹g γ_z
       τ̂ := τ − s⁻¹t γ_s    (75)

Now we can formulate the reduced KKT-system (see [Vanderbei, 1994] for the quadratic case):

[ −H   Aᵀ ] [ Δα ]   =   [ σ − g⁻¹z ν̂ − t⁻¹s τ̂ ]
[  A   0  ] [ Δy ]       [ ρ ]    (76)

where H := ( (1/2)∂²_α q(α) + g⁻¹z + t⁻¹s ).

    A.2 Iteration Strategies

For the predictor-corrector method we proceed as follows. In the predictor step solve the system of (75) and (76) with μ = 0 and all Δ-terms on the rhs set to 0, i.e. γ_z = −z, γ_s = −s. The values in Δ are substituted back into the definitions for γ_z and γ_s and (75) and (76) are solved again in the corrector step. As the quadratic part in (76) is not affected by the predictor-corrector steps, we only need to invert the quadratic matrix once. This is done best by manually pivoting for the H part, as it is positive definite.

Next the values in Δ obtained by such an iteration step are used to update the corresponding values in α, s, t, z, . . . . To ensure that the variables meet the positivity constraints, the steplength ξ is chosen such that the variables move at most 1 − ε of their initial distance to the boundaries of the positive orthant. Usually [Vanderbei, 1994] one sets ε = 0.05.

Another heuristic is used for computing μ, the parameter determining how much the KKT-conditions should be enforced. Obviously it is our aim to reduce μ as fast as possible, however if we happen to choose it too small, the condition of the equations will worsen drastically. A setting that has proven to work robustly is

μ = ( (⟨g, z⟩ + ⟨s, t⟩) / (2n) ) ( (ξ − 1) / (ξ + 10) )².    (77)

The rationale behind (77) is to use the average of the satisfaction of the KKT conditions (74) as point of reference and then decrease μ rapidly if we are far enough away from the boundaries of the positive orthant, to which all variables (except y) are constrained.

Finally one has to come up with good initial values.

Analogously to [Vanderbei, 1994] we choose a regularized version of (76) in order to determine the initial conditions. One solves

[ −( (1/2)∂²_α q(α) + 1 )   Aᵀ ] [ α ]   =   [ c ]
[            A               1 ] [ y ]       [ b ]    (78)

and subsequently restricts the solution to a feasible set

x = max( x, u/100 )
g = min( α − l, u )
t = min( u − α, u )
z = min( Θ( (1/2)∂_α q(α) + c − (Ay)ᵀ ) ( (1/2)∂_α q(α) + c − (Ay)ᵀ ) + u/100, u )
s = min( Θ( −( (1/2)∂_α q(α) + c − (Ay)ᵀ ) ) ( −( (1/2)∂_α q(α) + c − (Ay)ᵀ ) ) + u/100, u )    (79)


Θ(·) denotes the Heaviside function, i.e. Θ(x) = 1 for x > 0 and Θ(x) = 0 otherwise.

    A.3 Special considerations for SV regression

The algorithm described so far can be applied to both SV pattern recognition and regression estimation. For the standard setting in pattern recognition we have

q(α) = Σ_{i,j=1}^ℓ α_i α_j y_i y_j k(x_i, x_j)    (80)

and consequently ∂_{α_i} q(α) = 0, ∂²_{α_i α_j} q(α) = y_i y_j k(x_i, x_j), i.e. the Hessian is dense and the only thing we can do is compute its Cholesky factorization to compute (76). In the case of SV regression, however, we have (with α := (α_1, . . . , α_ℓ, α_1*, . . . , α_ℓ*))

q(α) = Σ_{i,j=1}^ℓ (α_i − α_i*)(α_j − α_j*) k(x_i, x_j) + 2C Σ_{i=1}^ℓ ( T(α_i) + T(α_i*) )    (81)

and therefore

∂_{α_i} q(α) = (d/dα_i) T(α_i)
∂²_{α_i α_j} q(α) = k(x_i, x_j) + δ_ij (d²/dα_i²) T(α_i)
∂²_{α_i α_j*} q(α) = −k(x_i, x_j)    (82)

and ∂²_{α_i* α_j*} q(α), ∂²_{α_i* α_j} q(α) analogously. Hence we are dealing with a matrix of type

M := [ K + D    −K     ]
     [ −K       K + D* ]

where D, D* are diagonal matrices. By applying an orthogonal transformation M can be inverted essentially by inverting an ℓ × ℓ matrix instead of a 2ℓ × 2ℓ system. This is exactly the additional advantage one can gain from implementing the optimization algorithm directly instead of using a general purpose optimizer. One can show that for practical implementations [Smola et al., 1998b] one can solve optimization problems using nearly arbitrary convex cost functions as efficiently as the special case of ε-insensitive loss functions.

Finally note that due to the fact that we are solving the primal and dual optimization problem simultaneously we are also computing parameters corresponding to the initial SV optimization problem. This observation is useful as it allows us to obtain the constant term b directly, namely by setting b = y. See [Smola, 1998] for details.
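The orthogonal transformation mentioned above is not spelled out in the text; one concrete way to realize it is sketched below. Applying Q = 2^{-1/2} [[1, 1], [1, −1]] ⊗ I blockwise turns M into a matrix with three diagonal blocks, so a single dense ℓ × ℓ Schur complement solve suffices. The construction and the random test data are our own illustration, under the assumption that D and D* have positive diagonals.

```python
import numpy as np

rng = np.random.default_rng(0)
l = 6
X = rng.normal(size=(l, 2))
K = np.exp(-0.5 * np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1))  # PSD kernel matrix
d, ds = rng.uniform(0.5, 1.5, size=l), rng.uniform(0.5, 1.5, size=l)      # diagonals of D, D*

M = np.block([[K + np.diag(d), -K], [-K, K + np.diag(ds)]])
r = rng.normal(size=2 * l)

# Transformed right-hand side: s = Q r with Q = 2^{-1/2} [[I, I], [I, -I]].
s1, s2 = (r[:l] + r[l:]) / np.sqrt(2), (r[:l] - r[l:]) / np.sqrt(2)

# Q M Q^T = 0.5 * [[D + D*, D - D*], [D - D*, 4K + D + D*]]; only the lower-right
# block is dense, so the Schur complement is the single l x l system to solve.
a11, a12 = 0.5 * (d + ds), 0.5 * (d - ds)
A22 = 0.5 * (4 * K + np.diag(d + ds))
S = A22 - np.diag(a12 ** 2 / a11)
u2 = np.linalg.solve(S, s2 - (a12 / a11) * s1)
u1 = (s1 - a12 * u2) / a11

# Transform back: v = Q u (Q is symmetric and orthogonal).
v = np.concatenate([u1 + u2, u1 - u2]) / np.sqrt(2)
print("matches direct solve:", np.allclose(v, np.linalg.solve(M, r)))
```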

    B Solving the Subset Selection Problem

    B.1 Subset Optimization Problem

We will adapt the exposition of Joachims [1999] to the case of regression with convex cost functions. Without loss of generality we will assume ε ≠ 0 and α ∈ [0, C] (the other situations can be treated as a special case). First we will extract a reduced optimization problem for the working set when all other variables are kept fixed. Denote by S_w ⊂ {1, . . . , ℓ} the working set and S_f := {1, . . . , ℓ}\S_w the fixed set. Writing (43) as an optimization problem only in terms of S_w yields

maximize   −(1/2) Σ_{i,j∈S_w} (α_i − α_i*)(α_j − α_j*)⟨x_i, x_j⟩
           + Σ_{i∈S_w} (α_i − α_i*) ( y_i − Σ_{j∈S_f} (α_j − α_j*)⟨x_i, x_j⟩ )
           + Σ_{i∈S_w} ( −ε(α_i + α_i*) + C (T(α_i) + T(α_i*)) )

subject to   Σ_{i∈S_w} (α_i − α_i*) = − Σ_{i∈S_f} (α_i − α_i*)
             α_i ∈ [0, C]    (83)

Hence we only have to update the linear term by the coupling with the fixed set −Σ_{i∈S_w} (α_i − α_i*) Σ_{j∈S_f} (α_j − α_j*)⟨x_i, x_j⟩ and the equality constraint by −Σ_{i∈S_f} (α_i − α_i*). It is easy to see that maximizing (83) also decreases (43) by exactly the same amount. If we choose variables for which the KKT-conditions are not satisfied the overall objective function tends to decrease whilst still keeping all variables feasible. Finally it is bounded from below.

Even though this does not prove convergence (unlike the statement in Osuna et al. [1997]) this algorithm proves very useful in practice. It is one of the few methods (besides [Kaufman, 1999, Platt, 1999]) that can deal with problems whose quadratic part does not completely fit into memory. Still in practice one has to take special precautions to avoid stalling of convergence (recent results of Chang et al. [1999] indicate that under certain conditions a proof of convergence is possible). The crucial part is the choice of S_w.

    B.2 A Note on Optimality

For convenience the KKT conditions are repeated in a slightly modified form. Denote by φ_i the error made by the current estimate at sample x_i, i.e.

φ_i := y_i − f(x_i) = y_i − [ Σ_{j=1}^m k(x_i, x_j)(α_j − α_j*) + b ].    (84)

Rewriting the feasibility conditions (52) in terms of α yields

2∂_{α_i} T(α_i) + ε − φ_i + s_i − z_i = 0
2∂_{α_i*} T(α_i*) + ε + φ_i + s_i* − z_i* = 0    (85)

for all i ∈ {1, . . . , m} with z_i, z_i*, s_i, s_i* ≥ 0. A set of dual feasible variables z, s is given by

z_i  = max( 2∂_{α_i} T(α_i) + ε − φ_i, 0 )
s_i  = − min( 2∂_{α_i} T(α_i) + ε − φ_i, 0 )
z_i* = max( 2∂_{α_i*} T(α_i*) + ε + φ_i, 0 )
s_i* = − min( 2∂_{α_i*} T(α_i*) + ε + φ_i, 0 )    (86)

Consequently the KKT conditions (53) can be translated into

α_i z_i = 0   and   (C − α_i) s_i = 0
α_i* z_i* = 0   and   (C − α_i*) s_i* = 0    (87)

All variables α_i, α_i* violating some of the conditions of (87) may be selected for further optimization. In most cases, especially in the initial stage of the optimization algorithm, this set


of patterns is much larger than any practical size of S_w. Unfortunately [Osuna et al., 1997] contains little information on how to select S_w. The heuristics presented here are an adaptation of [Joachims, 1999] to regression. See also [Lin, 2001] for details on optimization for SVR.

    B.3 Selection Rules

Similarly to a merit function approach [El-Bakry et al., 1996] the idea is to select those variables that violate (85) and (87) most, thus contribute most to the feasibility gap. Hence one defines a score variable ζ_i by

ζ_i := g_i z_i + s_i t_i
     = α_i z_i + α_i* z_i* + (C − α_i) s_i + (C − α_i*) s_i*    (88)

By construction, Σ_i ζ_i is the size of the feasibility gap (cf. (56) for the case of ε-insensitive loss). By decreasing this gap, one approaches the solution (upper bounded by the primal objective and lower bounded by the dual objective function). Hence, the selection rule is to choose those patterns for which ζ_i is largest. Some algorithms use

ζ_i′ := α_i Θ(z_i) + α_i* Θ(z_i*) + (C − α_i) Θ(s_i) + (C − α_i*) Θ(s_i*)
or
ζ_i″ := Θ(α_i) z_i + Θ(α_i*) z_i* + Θ(C − α_i) s_i + Θ(C − α_i*) s_i*.    (89)

One can see that ζ_i = 0, ζ_i′ = 0, and ζ_i″ = 0 mutually imply each other. However, only ζ_i gives a measure for the contribution of the variable i to the size of the feasibility gap.

Finally, note that heuristics like assigning sticky-flags (cf. [Burges, 1998]) to variables at the boundaries, thus effectively solving smaller subproblems, or completely removing the corresponding patterns from the training set while accounting for their couplings [Joachims, 1999] can significantly decrease the size of the problem one has to solve and thus result in a noticeable speedup. Also caching [Joachims, 1999, Kowalczyk, 2000] of already computed entries of the dot product matrix may have a significant impact on the performance.
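For the ε-insensitive loss (T ≡ 0) the score (88) together with (86) can be evaluated directly from the current errors. The sketch below does exactly that and returns the indices of the largest violators as the next working set; the function name, array names, and the working-set size are our own choices.

```python
import numpy as np

def select_working_set(alpha, alpha_star, phi, eps, C, q):
    """Pick the q variables contributing most to the feasibility gap (88).

    alpha, alpha_star : current dual variables
    phi               : current errors y_i - f(x_i), cf. (84)
    eps, C            : tube width and regularization constant
    (a sketch for the eps-insensitive case, i.e. T = 0 in (86))
    """
    z      = np.maximum(eps - phi, 0.0)
    s      = -np.minimum(eps - phi, 0.0)
    z_star = np.maximum(eps + phi, 0.0)
    s_star = -np.minimum(eps + phi, 0.0)

    zeta = (alpha * z + alpha_star * z_star
            + (C - alpha) * s + (C - alpha_star) * s_star)
    return np.argsort(zeta)[::-1][:q]        # indices with the largest scores

# Example call with toy values:
rng = np.random.default_rng(0)
idx = select_working_set(rng.uniform(0, 1, 10), rng.uniform(0, 1, 10),
                         rng.normal(size=10), eps=0.1, C=1.0, q=4)
print("working set:", idx)
```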

    C Solving the SMO Equations

    C.1 Pattern Dependent Regularization

Consider the constrained optimization problem (83) for two indices, say (i, j). Pattern dependent regularization means that C_i may be different for every pattern (possibly even different for α_i and α_i*). Since at most two variables may become nonzero at the same time and moreover we are dealing with a constrained optimization problem, we may express everything in terms of just one variable. From the summation constraint we obtain

(α_i − α_i*) + (α_j − α_j*) = (α_i^old − α_i*^old) + (α_j^old − α_j*^old) := γ    (90)

for regression. Exploiting α_j^(*) ∈ [0, C_j^(*)] yields α_i^(*) ∈ [L, H]. This is taking account of the fact that there may be only four different pairs of nonzero variables: (α_i, α_j), (α_i*, α_j), (α_i, α_j*), and (α_i*, α_j*). For convenience define an auxiliary variable s such that s = 1 in the first and the last case and s = −1 otherwise.

                α_j                             α_j*
α_i     L   max(0, γ − C_j)                 max(0, γ)
        H   min(C_i, γ)                     min(C_i, C_j* + γ)
α_i*    L   max(0, −γ)                      max(0, −γ − C_j*)
        H   min(C_i*, −γ + C_j)             min(C_i*, −γ)

    C.2 Analytic Solution for Regression

Next one has to solve the optimization problem analytically. We make use of (84) and substitute the values of φ_i into the reduced optimization problem (83). In particular we use

y_i − Σ_{j∉S_w} (α_j − α_j*) K_ij = φ_i + b + Σ_{j∈S_w} (α_j^old − α_j*^old) K_ij.    (91)

Moreover with the auxiliary variables γ = α_i − α_i* + α_j − α_j* and η := (K_ii + K_jj − 2K_ij) one obtains the following constrained optimization problem in α_i^(*) (after eliminating α_j, ignoring terms independent of α_j, α_j*, and noting that this only holds for α_i α_i* = α_j α_j* = 0):

maximize   −(1/2)(α_i − α_i*)² η − ε(α_i + α_i*)(1 − s) + (α_i − α_i*)( φ_i − φ_j + η(α_i^old − α_i*^old) )

subject to   α_i^(*) ∈ [L^(*), H^(*)].    (92)

The unconstrained maximum of (92) with respect to α_i or α_i* can be found below.

(I)    α_i, α_j:      α_i^old + η⁻¹(φ_i − φ_j)
(II)   α_i, α_j*:     α_i^old + η⁻¹(φ_i − φ_j − 2ε)
(III)  α_i*, α_j:     α_i*^old − η⁻¹(φ_i − φ_j + 2ε)
(IV)   α_i*, α_j*:    α_i*^old − η⁻¹(φ_i − φ_j)

The problem is that we do not know beforehand which of the four quadrants (I)-(IV) contains the solution. However, by considering the sign of γ we can distinguish two cases: for γ > 0 only (I)-(III) are possible, for γ < 0 the coefficients satisfy one of the cases (II)-(IV). In case of γ = 0 only (II) and (III) have to be considered. See also the diagram below.

[Diagram: the four cases (I)-(IV) arranged as quadrants over the α_i/α_i* and α_j/α_j* axes, with the regions γ > 0 and γ < 0 indicated.]

For γ > 0 it is best to start with quadrant (I), test whether the unconstrained solution hits one of the boundaries L, H and if so, probe the corresponding adjacent quadrant (II) or (III). γ < 0 can be dealt with analogously.

Due to numerical instabilities, it may happen that η < 0. In that case η should be set to 0 and one has to solve (92) in a linear fashion directly.11
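A minimal sketch of the case analysis above is given below: it computes the four unconstrained candidates (I)-(IV) for given errors φ_i, φ_j and clips a chosen candidate to its interval [L, H] from the table in C.1. It deliberately omits the surrounding bookkeeping (updating α_j via (90), recomputing errors); function and variable names are hypothetical.

```python
def smo_candidates(phi_i, phi_j, eps, eta, alpha_i_old, alpha_i_star_old):
    """Unconstrained maximizers of (92) for the four sign patterns (I)-(IV).
    eta = K_ii + K_jj - 2*K_ij; phi_* are the current errors from (84)."""
    if eta <= 0.0:
        # numerical safeguard discussed in the text: treat (92) as linear instead
        return None
    return {
        "I":   alpha_i_old      + (phi_i - phi_j) / eta,            # (alpha_i,  alpha_j)
        "II":  alpha_i_old      + (phi_i - phi_j - 2 * eps) / eta,  # (alpha_i,  alpha_j*)
        "III": alpha_i_star_old - (phi_i - phi_j + 2 * eps) / eta,  # (alpha_i*, alpha_j)
        "IV":  alpha_i_star_old - (phi_i - phi_j) / eta,            # (alpha_i*, alpha_j*)
    }

def clip(value, L, H):
    """Clip a candidate to its feasible interval [L, H] (table in C.1)."""
    return min(max(value, L), H)
```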

11 Negative values of η are theoretically impossible since k satisfies Mercer's condition: 0 ≤ ‖Φ(x_i) − Φ(x_j)‖² = K_ii + K_jj − 2K_ij = η.


C.3 Selection Rule for Regression

Finally, one has to pick indices (i, j) such that the objective function is maximized. Again, the reasoning of SMO [Platt, 1999, sec. 12.2.2] for classification will be mimicked. This means that a two loop approach is chosen to maximize the objective function. The outer loop iterates over all patterns violating the KKT conditions, first only over those with Lagrange multipliers neither on the upper nor lower boundary, and once all of them are satisfied, over all patterns violating the KKT conditions, to ensure self consistency on the complete dataset.12 This solves the problem of choosing i.

Now for j: To make a large step towards the minimum, one looks for large steps in α_i. As it is computationally expensive to compute η for all possible pairs (i, j) one chooses the heuristic to maximize the absolute value of the numerator in the expressions for α_i and α_i*, i.e. |φ_i − φ_j| and |φ_i − φ_j ± 2ε|. The index j corresponding to the maximum absolute value is chosen for this purpose.

If this heuristic happens to fail, in other words if little progress is made by this choice, all other indices j are looked at (this is what is called "second choice hierarchy" in [Platt, 1999]) in the following way:

1. All indices j corresponding to non-bound examples are looked at, searching for an example to make progress on.

2. In the case that the first heuristic was unsuccessful, all other samples are analyzed until an example is found where progress can be made.

    3. If both previous steps fail proceed to the next i.

For a more detailed discussion see [Platt, 1999]. Unlike interior point algorithms SMO does not automatically provide a value for b. However this can be chosen like in section 1.4 by having a close look at the Lagrange multipliers α_i^(*) obtained.

    C.4 Stopping Criteria

By essentially minimizing a constrained primal optimization problem one cannot ensure that the dual objective function increases with every iteration step.13 Nevertheless one knows that the minimum value of the objective function lies in the interval [dual objective_i, primal objective_i] for all steps i, hence also in the interval [max_{j≤i} dual objective_j, primal objective_i]. One uses the latter to determine the quality of the current solution.
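For the ε-insensitive loss the two ends of that interval are cheap to evaluate, so a relative duality gap makes a convenient stopping criterion. The sketch below assumes a kernel matrix K, current coefficients α, α*, and offset b; it is an illustration of the idea, not the pseudo code referenced in the footnotes.

```python
import numpy as np

def duality_gap(K, y, alpha, alpha_star, b, eps, C):
    """Relative gap between primal and dual objective for eps-insensitive SVR."""
    coef = alpha - alpha_star
    f = K @ coef + b                                   # current predictions
    w_norm_sq = coef @ K @ coef                        # ||w||^2 in feature space

    primal = 0.5 * w_norm_sq + C * np.sum(np.maximum(np.abs(y - f) - eps, 0.0))
    dual = -0.5 * w_norm_sq - eps * np.sum(alpha + alpha_star) + y @ coef
    return (primal - dual) / (np.abs(primal) + 1e-12)

# Training can be stopped once duality_gap(...) falls below a tolerance, e.g. 1e-3.
```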

12 It is sometimes useful, especially when dealing with noisy data, to iterate over the complete KKT violating dataset already before complete self consistency on the subset has been achieved. Otherwise much computational resources are spent on making subsets self consistent that are not globally self consistent. This is the reason why in the pseudo code a global loop is initiated already when only less than 10% of the non-bound variables changed.

13 It is still an open question how a subset selection optimization algorithm could be devised that decreases both primal and dual objective function at the same time. The problem is that this usually involves a number of dual variables of the order of the sample size, which makes this attempt impractical.

The calculation of the primal objective function from the prediction errors is straightforward. One uses

Σ_{i,j} (α_i − α_i*)(α_j − α_j*) k_ij = Σ_i (α_i − α_i*)(−φ_i + y_i − b),    (93)

i.e. the definition of φ_i to avoid the matr