\begin{thebibliography}{} \bibitem[Akaike, 1970]{Aka:1969} Akaike, H. (1970). \newblock Statistical predictor identification. \newblock {\em Ann. Inst. Statist. Math.}, 22:203--217. \bibitem[Akaike, 1973]{Aka:1973} Akaike, H. (1973). \newblock Information theory and an extension of the maximum likelihood principle. \newblock In {\em Second International Symposium on Information Theory (Tsahkadsor, 1971)}, pages 267--281. Akad\'emiai Kiad\'o, Budapest. \bibitem[Allen, 1974]{All:1974} Allen, D.~M. (1974). \newblock The relationship between variable selection and data augmentation and a method for prediction. \newblock {\em Technometrics}, 16:125--127. \bibitem[Alpaydin, 1999]{Alp:1999} Alpaydin, E. (1999). \newblock Combined {$5 \times 2$} cv {F} test for comparing supervised classification learning algorithms. \newblock {\em Neur. Comp.}, 11(8):1885--1892. \bibitem[Anderson et~al., 1972]{AnAB72} Anderson, R.~L., Allen, D.~M., and Cady, F.~B. (1972). \newblock Selection of predictor variables in linear multiple regression. \newblock In Bancroft, T.~A., editor, {\em Statistical Papers in Honor of George W. Snedecor}. Iowa: Iowa State University Press. \bibitem[Arlot, 2007]{Arlo07} Arlot, S. (2007). \newblock {\em Resampling and Model Selection}. \newblock PhD thesis, University Paris-Sud 11. \newblock oai:tel.archives-ouvertes.fr:tel-00198803\_v1. \bibitem[Arlot, 2008a]{Arlo08c} Arlot, S. (2008a). \newblock Model selection by resampling penalization. \newblock {\em Electronic Journal of Statistics}. \newblock To appear. oai:hal.archives-ouvertes.fr:hal-00262478\_v1. \bibitem[Arlot, 2008b]{Arlo08} Arlot, S. (2008b). \newblock Suboptimality of penalties proportional to the dimension for model selection in heteroscedastic regression. \newblock arXiv:0812.3141. \bibitem[Arlot, 2008c]{Arl:2008a} Arlot, S. (2008c). \newblock {$V$}-fold cross-validation improved: {$V$}-fold penalization. \newblock arXiv:0802.0566v2. \bibitem[Arlot and Celisse, 2009]{ArCe09} Arlot, S. 
and Celisse, A. (2009). \newblock {S}egmentation in the mean of heteroscedastic data via cross-validation. \newblock arXiv:0902.3977v2. \bibitem[Arlot and Massart, 2009]{Arl_Mas:2009:pente} Arlot, S. and Massart, P. (2009). \newblock Data-driven calibration of penalties for least-squares regression. \newblock {\em J. Mach. Learn. Res.}, 10:245--279 (electronic). \bibitem[Baraud, 2002]{Bar:2002} Baraud, Y. (2002). \newblock Model selection for regression on a random design. \newblock {\em ESAIM Probab. Statist.}, 6:127--146 (electronic). \bibitem[Barron et~al., 1999]{Bar_Bir_Mas:1999} Barron, A., Birg{\'e}, L., and Massart, P. (1999). \newblock Risk bounds for model selection via penalization. \newblock {\em Probab. Theory Related Fields}, 113(3):301--413. \bibitem[Bartlett et~al., 2002]{Bar_Bou_Lug:2002} Bartlett, P.~L., Boucheron, S., and Lugosi, G. (2002). \newblock Model selection and error estimation. \newblock {\em Machine Learning}, 48:85--113. \bibitem[Bartlett et~al., 2005]{Bar_Bou_Men:2005} Bartlett, P.~L., Bousquet, O., and Mendelson, S. (2005). \newblock Local {R}ademacher complexities. \newblock {\em Ann. Statist.}, 33(4):1497--1537. \bibitem[Bartlett and Mendelson, 2002]{Bar_Men:2002} Bartlett, P.~L. and Mendelson, S. (2002). \newblock Rademacher and {G}aussian complexities: risk bounds and structural results. \newblock {\em J. Mach. Learn. Res.}, 3(Spec. Issue Comput. Learn. Theory):463--482. \bibitem[Bellman and Dreyfus, 1962]{BeDr62} Bellman, R.~E. and Dreyfus, S.~E. (1962). \newblock {\em {A}pplied {D}ynamic {P}rogramming}. \newblock Princeton. \bibitem[Bengio and Grandvalet, 2004]{Ben_Gra:2004} Bengio, Y. and Grandvalet, Y. (2004). \newblock No unbiased estimator of the variance of {$K$}-fold cross-validation. \newblock {\em J. Mach. Learn. Res.}, 5:1089--1105 (electronic). \bibitem[Bhansali and Downham, 1977]{Bha_Dow:1977} Bhansali, R.~J. and Downham, D.~Y. (1977). 
\newblock Some properties of the order of an autoregressive model selected by a generalization of {A}kaike's {FPE} criterion. \newblock {\em Biometrika}, 64(3):547--551. \bibitem[Birg{\'e} and Massart, 2001]{Bir_Mas:2002} Birg{\'e}, L. and Massart, P. (2001). \newblock Gaussian model selection. \newblock {\em J. Eur. Math. Soc. (JEMS)}, 3(3):203--268. \bibitem[Birg{\'e} and Massart, 2007]{Bir_Mas:2006} Birg{\'e}, L. and Massart, P. (2007). \newblock Minimal penalties for {G}aussian model selection. \newblock {\em Probab. Theory Related Fields}, 138(1--2):33--73. \bibitem[Blanchard and Massart, 2006]{Bla_Mas:2006} Blanchard, G. and Massart, P. (2006). \newblock Discussion: ``{L}ocal {R}ademacher complexities and oracle inequalities in risk minimization'' [{A}nn. {S}tatist. {\bf 34} (2006), no. 6, 2593--2656] by {V}. {K}oltchinskii. \newblock {\em Ann. Statist.}, 34(6):2664--2671. \bibitem[Boucheron et~al., 2005]{Bou_Bou_Lug:2005} Boucheron, S., Bousquet, O., and Lugosi, G. (2005). \newblock Theory of classification: a survey of some recent advances. \newblock {\em ESAIM Probab. Stat.}, 9:323--375 (electronic). \bibitem[Bousquet and Elisseeff, 2002]{BoEl02} Bousquet, O. and Elisseeff, A. (2002). \newblock {S}tability and {G}eneralization. \newblock {\em J. Machine Learning Research}, 2:499--526. \bibitem[Bowman, 1984]{Bowm84} Bowman, A.~W. (1984). \newblock {A}n alternative method of cross-validation for the smoothing of density estimates. \newblock {\em Biometrika}, 71(2):353--360. \bibitem[Breiman, 1996]{Bre:1996} Breiman, L. (1996). \newblock Heuristics of instability and stabilization in model selection. \newblock {\em Ann. Statist.}, 24(6):2350--2383. \bibitem[Breiman, 1998]{Brei96c} Breiman, L. (1998). \newblock Arcing classifiers. \newblock {\em Ann. Statist.}, 26(3):801--849. \newblock With discussion and a rejoinder by the author. \bibitem[Breiman et~al., 1984]{Bre_etal:1984} Breiman, L., Friedman, J.~H., Olshen, R.~A., and Stone, C.~J. (1984). 
\newblock {\em Classification and regression trees}. \newblock Wadsworth Statistics/Probability Series. Wadsworth Advanced Books and Software, Belmont, CA. \bibitem[Breiman and Spector, 1992]{Bre_Spe:1992} Breiman, L. and Spector, P. (1992). \newblock Submodel selection and evaluation in regression. {T}he {X}-random case. \newblock {\em International Statistical Review}, 60(3):291--319. \bibitem[Burman, 1989]{Bur:1989} Burman, P. (1989). \newblock A comparative study of ordinary cross-validation, {$v$}-fold cross-validation and the repeated learning-testing methods. \newblock {\em Biometrika}, 76(3):503--514. \bibitem[Burman, 1990]{Bur:1990} Burman, P. (1990). \newblock Estimation of optimal transformations using {$v$}-fold cross validation and repeated learning-testing methods. \newblock {\em Sankhy\=a Ser. A}, 52(3):314--345. \bibitem[Burman et~al., 1994]{Bur_Cho_Nol:1994} Burman, P., Chow, E., and Nolan, D. (1994). \newblock A cross-validatory method for dependent data. \newblock {\em Biometrika}, 81(2):351--358. \bibitem[Burman and Nolan, 1992]{Bur_Nol:1992} Burman, P. and Nolan, D. (1992). \newblock Data-dependent estimation of prediction functions. \newblock {\em J. Time Ser. Anal.}, 13(3):189--207. \bibitem[Burnham and Anderson, 2002]{Bur_And:2002} Burnham, K.~P. and Anderson, D.~R. (2002). \newblock {\em Model selection and multimodel inference}. \newblock Springer-Verlag, New York, second edition. \newblock A practical information-theoretic approach. \bibitem[Cao and Golubev, 2006]{Cao_Gol:2006} Cao, Y. and Golubev, Y. (2006). \newblock On oracle inequalities related to smoothing splines. \newblock {\em Math. Methods Statist.}, 15(4):398--414. \bibitem[Celisse, 2008a]{Celi08} Celisse, A. (2008a). \newblock {D}ensity estimation via cross-validation: Model selection point of view. \newblock Technical report, arXiv: 0811.0802. \bibitem[Celisse, 2008b]{Cel:2008:phd} Celisse, A. (2008b). 
\newblock {\em {M}odel Selection Via Cross-Validation in Density Estimation, Regression and Change-Points Detection}. \newblock PhD thesis, University Paris-Sud 11, \texttt{http://tel.archives-ouvertes.fr/tel-00346320/en/}. \bibitem[Celisse and Robin, 2008]{CeRo08} Celisse, A. and Robin, S. (2008). \newblock {N}onparametric density estimation by exact leave-{$p$}-out cross-validation. \newblock {\em Computational Statistics and Data Analysis}, 52(5):2350--2368. \bibitem[Chow et~al., 1987]{ChGW87} Chow, Y.~S., Geman, S., and Wu, L.~D. (1987). \newblock Consistent cross-validated density estimation. \newblock {\em Ann. Statist.}, 11:25--38. \bibitem[Chu and Marron, 1991]{Chu_Mar:1991} Chu, C.-K. and Marron, J.~S. (1991). \newblock Comparison of two bandwidth selectors with dependent errors. \newblock {\em Ann. Statist.}, 19(4):1906--1918. \bibitem[Craven and Wahba, 1979]{Cra_Wah:1979} Craven, P. and Wahba, G. (1979). \newblock Smoothing noisy data with spline functions. {E}stimating the correct degree of smoothing by the method of generalized cross-validation. \newblock {\em Numer. Math.}, 31(4):377--403. \bibitem[Dalelane, 2005]{Dale05} Dalelane, C. (2005). \newblock Exact oracle inequality for sharp adaptive kernel density estimator. \newblock Technical report, arXiv. \bibitem[Daudin and Mary-Huard, 2008]{DaMa08} Daudin, J.-J. and Mary-Huard, T. (2008). \newblock {E}stimation of the conditional risk in classification: {T}he swapping method. \newblock {\em Comput. Stat. Data Anal.}, 52(6):3220--3232. \bibitem[Davison and Hall, 1992]{Dav_Hal:1992} Davison, A.~C. and Hall, P. (1992). \newblock On the bias and variability of bootstrap and cross-validation estimates of error rate in discrimination problems. \newblock {\em Biometrika}, 79(2):279--284. \bibitem[Devroye et~al., 1996]{Dev_Gyo_Lug:1996} Devroye, L., Gy{\"o}rfi, L., and Lugosi, G. (1996). \newblock {\em A probabilistic theory of pattern recognition}, volume~31 of {\em Applications of Mathematics (New York)}. 
\newblock Springer-Verlag, New York. \bibitem[Devroye and Wagner, 1979]{DeWa79} Devroye, L. and Wagner, T.~J. (1979). \newblock {D}istribution-{F}ree performance {B}ounds for {P}otential {F}unction {R}ules. \newblock {\em IEEE Transactions on Information Theory}, 25(5):601--604. \bibitem[Dietterich, 1998]{Die:1998} Dietterich, T.~G. (1998). \newblock Approximate statistical tests for comparing supervised classification learning algorithms. \newblock {\em Neur. Comp.}, 10(7):1895--1924. \bibitem[Efron, 1983]{Efr:1983} Efron, B. (1983). \newblock Estimating the error rate of a prediction rule: improvement on cross-validation. \newblock {\em J. Amer. Statist. Assoc.}, 78(382):316--331. \bibitem[Efron, 1986]{Efr:1986} Efron, B. (1986). \newblock How biased is the apparent error rate of a prediction rule? \newblock {\em J. Amer. Statist. Assoc.}, 81(394):461--470. \bibitem[Efron, 2004]{Efr:2004} Efron, B. (2004). \newblock The estimation of prediction error: covariance penalties and cross-validation. \newblock {\em J. Amer. Statist. Assoc.}, 99(467):619--642. \newblock With comments and a rejoinder by the author. \bibitem[Efron and Morris, 1973]{EfMo73} Efron, B. and Morris, C. (1973). \newblock Combining possibly related estimation problems (with discussion). \newblock {\em J. R. Statist. Soc. B}, 35:379. \bibitem[Efron and Tibshirani, 1997]{Efr_Tib:1997} Efron, B. and Tibshirani, R. (1997). \newblock Improvements on cross-validation: the .632+ bootstrap method. \newblock {\em J. Amer. Statist. Assoc.}, 92(438):548--560. \bibitem[Fromont, 2007]{Fro:2007} Fromont, M. (2007). \newblock Model selection by bootstrap penalization for classification. \newblock {\em Mach. Learn.}, 66(2--3):165--207. \bibitem[Geisser, 1974]{Geis74} Geisser, S. (1974). \newblock {A} predictive approach to the random effect model. \newblock {\em Biometrika}, 61(1):101--107. \bibitem[Geisser, 1975]{Gei:1975} Geisser, S. (1975). \newblock The predictive sample reuse method with applications. 
\newblock {\em J. Amer. Statist. Assoc.}, 70:320--328. \bibitem[Girard, 1998]{Gir:1998} Girard, D.~A. (1998). \newblock Asymptotic comparison of (partial) cross-validation, {GCV} and randomized {GCV} in nonparametric regression. \newblock {\em Ann. Statist.}, 26(1):315--334. \bibitem[Gr{\"u}nwald, 2007]{Gru:2007} Gr{\"u}nwald, P.~D. (2007). \newblock {\em The {M}inimum {D}escription {L}ength {P}rinciple}. \newblock MIT Press, Cambridge, MA, USA. \bibitem[Gy{\"o}rfi et~al., 2002]{Gyo_etal:2002} Gy{\"o}rfi, L., Kohler, M., Krzy{\.z}ak, A., and Walk, H. (2002). \newblock {\em A distribution-free theory of nonparametric regression}. \newblock Springer Series in Statistics. Springer-Verlag, New York. \bibitem[Hall, 1983]{Hal:1983} Hall, P. (1983). \newblock Large sample optimality of least squares cross-validation in density estimation. \newblock {\em Ann. Statist.}, 11(4):1156--1174. \bibitem[Hall, 1987]{Hall87} Hall, P. (1987). \newblock {O}n {K}ullback-{L}eibler loss and density estimation. \newblock {\em The Annals of Statistics}, 15(4):1491--1519. \bibitem[Hall et~al., 1995]{Hal_Lah_Pol:1995} Hall, P., Lahiri, S.~N., and Polzehl, J. (1995). \newblock On bandwidth choice in nonparametric regression with both short- and long-range dependent errors. \newblock {\em Ann. Statist.}, 23(6):1921--1936. \bibitem[Hall et~al., 1992]{Hal_Mar_Par:1992} Hall, P., Marron, J.~S., and Park, B.~U. (1992). \newblock Smoothed cross-validation. \newblock {\em Probab. Theory Related Fields}, 92(1):1--20. \bibitem[Hall and Schucany, 1989]{Hal_Sch:1989} Hall, P. and Schucany, W.~R. (1989). \newblock A local cross-validation algorithm. \newblock {\em Statist. Probab. Lett.}, 8(2):109--117. \bibitem[H{\"a}rdle, 1984]{Haerd84} H{\"a}rdle, W. (1984). \newblock How to determine the bandwidth of some nonlinear smoothers in practice. \newblock In {\em Robust and nonlinear time series analysis ({H}eidelberg, 1983)}, volume~26 of {\em Lecture Notes in Statist.}, pages 163--184. Springer, New York. 
\bibitem[H{\"a}rdle et~al., 1988]{Har_Hal_Mar:1988} H{\"a}rdle, W., Hall, P., and Marron, J.~S. (1988). \newblock How far are automatically chosen regression smoothing parameters from their optimum? \newblock {\em J. Amer. Statist. Assoc.}, 83(401):86--101. \newblock With comments by David W. Scott and Iain Johnstone and a reply by the authors. \bibitem[Hart, 1994]{Har:1994} Hart, J.~D. (1994). \newblock Automated kernel smoothing of dependent data by using time series cross-validation. \newblock {\em J. Roy. Statist. Soc. Ser. B}, 56(3):529--542. \bibitem[Hart and Vieu, 1990]{Har_Vie:1990} Hart, J.~D. and Vieu, P. (1990). \newblock Data-driven bandwidth choice for density estimation based on dependent data. \newblock {\em Ann. Statist.}, 18(2):873--890. \bibitem[Hart and Wehrly, 1986]{Har_Weh:1986} Hart, J.~D. and Wehrly, T.~E. (1986). \newblock Kernel regression estimation using repeated measurements data. \newblock {\em J. Amer. Statist. Assoc.}, 81(396):1080--1088. \bibitem[Hastie et~al., 2001]{Has_Tib_Fri:2001} Hastie, T., Tibshirani, R., and Friedman, J. (2001). \newblock {\em The elements of statistical learning}. \newblock Springer Series in Statistics. Springer-Verlag, New York. \newblock Data mining, inference, and prediction. \bibitem[Herzberg and Tsukanov, 1986]{HeTs86} Herzberg, A.~M. and Tsukanov, A.~V. (1986). \newblock A note on modifications of jackknife criterion for model selection. \newblock {\em Utilitas Math.}, 29:209--216. \bibitem[Herzberg, 1969]{Herz69} Herzberg, P.~A. (1969). \newblock {T}he parameters of cross-validation. \newblock {\em Psychometrika}, 34:Monograph Supplement. \bibitem[Hesterberg et~al., 2008]{Hes_etal:2008} Hesterberg, T.~C., Choi, N.~H., Meier, L., and Fraley, C. (2008). \newblock Least angle and {$\ell_1$} penalized regression: A review. \newblock {\em Statistics Surveys}, 2:61--93 (electronic). \bibitem[Hills, 1966]{Hill66} Hills, M. (1966). \newblock {A}llocation {R}ules and their {E}rror {R}ates. \newblock {\em J. 
Royal Statist. Soc. Series B}, 28(1):1--31. \bibitem[Huber, 1964]{Hube64} Huber, P. (1964). \newblock Robust estimation of a local parameter. \newblock {\em Ann. Math. Statist.}, 35:73--101. \bibitem[Hurvich and Tsai, 1989]{Hur_Tsa:1989} Hurvich, C.~M. and Tsai, C.-L. (1989). \newblock Regression and time series model selection in small samples. \newblock {\em Biometrika}, 76(2):297--307. \bibitem[John, 1971]{Joh:1971} John, P. W.~M. (1971). \newblock {\em Statistical design and analysis of experiments}. \newblock The Macmillan Co., New York. \bibitem[Jonathan et~al., 2000]{JoKM00} Jonathan, P., Krzanowski, W.~J., and McCarthy, W.~V. (2000). \newblock On the use of cross-validation to assess performance in multivariate prediction. \newblock {\em Stat. and Comput.}, 10:209--229. \bibitem[Kearns et~al., 1997]{KMNR97} Kearns, M., Mansour, Y., Ng, A.~Y., and Ron, D. (1997). \newblock {A}n {E}xperimental and {T}heoretical {C}omparison of {M}odel {S}election {M}ethods. \newblock {\em Machine Learning}, 27:7--50. \bibitem[Kearns and Ron, 1999]{KeRo99} Kearns, M. and Ron, D. (1999). \newblock {A}lgorithmic {S}tability and {S}anity-{C}heck {B}ounds for {L}eave-{O}ne-{O}ut {C}ross-{V}alidation. \newblock {\em Neural Computation}, 11:1427--1453. \bibitem[Koltchinskii, 2001]{Kol:2001} Koltchinskii, V. (2001). \newblock Rademacher penalties and structural risk minimization. \newblock {\em IEEE Trans. Inform. Theory}, 47(5):1902--1914. \bibitem[Koltchinskii, 2006]{Kol:2006} Koltchinskii, V. (2006). \newblock Local {R}ademacher complexities and oracle inequalities in risk minimization. \newblock {\em Ann. Statist.}, 34(6):2593--2656. \bibitem[Lachenbruch and Mickey, 1968]{LaMi68} Lachenbruch, P.~A. and Mickey, M.~R. (1968). \newblock {E}stimation of {E}rror {R}ates in {D}iscriminant {A}nalysis. \newblock {\em Technometrics}, 10(1):1--11. \bibitem[Larson, 1931]{Lars31} Larson, S.~C. (1931). \newblock {T}he shrinkage of the coefficient of multiple correlation. \newblock {\em J. 
Educ. Psychol.}, 22:45--55. \bibitem[Leung et~al., 1993]{LeMW93} Leung, D., Marriott, F., and Wu, E. (1993). \newblock Bandwidth selection in robust smoothing. \newblock {\em J. Nonparametr. Statist.}, 2:333--339. \bibitem[Leung, 2005]{Leu:2005} Leung, D. H.-Y. (2005). \newblock Cross-validation in nonparametric regression with outliers. \newblock {\em Ann. Statist.}, 33(5):2291--2310. \bibitem[Li, 1985]{Li85} Li, K.-C. (1985). \newblock From {S}tein's unbiased risk estimates to the method of generalized cross validation. \newblock {\em Ann. Statist.}, 13(4):1352--1377. \bibitem[Li, 1987]{KCLi:1987} Li, K.-C. (1987). \newblock Asymptotic optimality for {$C\sb p$}, {$C\sb L$}, cross-validation and generalized cross-validation: discrete index set. \newblock {\em Ann. Statist.}, 15(3):958--975. \bibitem[Mallows, 1973]{Mal:1973} Mallows, C.~L. (1973). \newblock Some comments on ${C}_p$. \newblock {\em Technometrics}, 15:661--675. \bibitem[Markatou et~al., 2005]{Mar_etal:2005} Markatou, M., Tian, H., Biswas, S., and Hripcsak, G. (2005). \newblock Analysis of variance of cross-validation estimators of the generalization error. \newblock {\em J. Mach. Learn. Res.}, 6:1127--1168 (electronic). \bibitem[Massart, 2007]{Mas:2003:St-Flour} Massart, P. (2007). \newblock {\em Concentration inequalities and model selection}, volume 1896 of {\em Lecture Notes in Mathematics}. \newblock Springer, Berlin. \newblock Lectures from the 33rd Summer School on Probability Theory held in Saint-Flour, July 6--23, 2003, With a foreword by Jean Picard. \bibitem[Molinaro et~al., 2005]{Mol_Sim_Pfe:2005} Molinaro, A.~M., Simon, R., and Pfeiffer, R.~M. (2005). \newblock Prediction error estimation: a comparison of resampling methods. \newblock {\em Bioinformatics}, 21(15):3301--3307. \bibitem[Mosteller and Tukey, 1968]{MoTu68} Mosteller, F. and Tukey, J.~W. (1968). \newblock {D}ata analysis, including statistics. \newblock In Lindzey, G. 
and Aronson, E., editors, {\em {H}andbook of {S}ocial {P}sychology, {V}ol. 2}. Addison-Wesley. \bibitem[Nadeau and Bengio, 2003]{NaBe03} Nadeau, C. and Bengio, Y. (2003). \newblock Inference for the generalization error. \newblock {\em Machine Learning}, 52:239--281. \bibitem[Nishii, 1984]{Nis:1984} Nishii, R. (1984). \newblock Asymptotic properties of criteria for selection of variables in multiple regression. \newblock {\em Ann. Statist.}, 12(2):758--765. \bibitem[Opsomer et~al., 2001]{Ops_Wan_Yan:2001} Opsomer, J., Wang, Y., and Yang, Y. (2001). \newblock Nonparametric regression with correlated errors. \newblock {\em Statist. Sci.}, 16(2):134--153. \bibitem[Picard and Cook, 1984]{Pic_Coo:1984} Picard, R.~R. and Cook, R.~D. (1984). \newblock Cross-validation of regression models. \newblock {\em J. Amer. Statist. Assoc.}, 79(387):575--583. \bibitem[Politis et~al., 1999]{Pol_Rom_Wol:1999} Politis, D.~N., Romano, J.~P., and Wolf, M. (1999). \newblock {\em Subsampling}. \newblock Springer Series in Statistics. Springer-Verlag, New York. \bibitem[Quenouille, 1949]{Que:1949} Quenouille, M.~H. (1949). \newblock Approximate tests of correlation in time-series. \newblock {\em J. Roy. Statist. Soc. Ser. B.}, 11:68--84. \bibitem[Ripley, 1996]{Ripl96} Ripley, B.~D. (1996). \newblock {\em Pattern Recognition and Neural Networks}. \newblock Cambridge Univ. Press. \bibitem[Rissanen, 1983]{Riss83} Rissanen, J. (1983). \newblock {U}niversal {P}rior for {I}ntegers and {E}stimation by {M}inimum {D}escription {L}ength. \newblock {\em The Annals of Statistics}, 11(2):416--431. \bibitem[Ronchetti et~al., 1997]{RoFB97} Ronchetti, E., Field, C., and Blanchard, W. (1997). \newblock Robust linear model selection by cross-validation. \newblock {\em J. Amer. Statist. Assoc.}, 92:1017--1023. \bibitem[Rudemo, 1982]{Rude82} Rudemo, M. (1982). \newblock {E}mpirical {C}hoice of {H}istograms and {K}ernel {D}ensity {E}stimators. \newblock {\em Scandinavian Journal of Statistics}, 9:65--78. 
\bibitem[Sauv\'e, 2009]{Sau:2006} Sauv\'e, M. (2009). \newblock Histogram selection in non {G}aussian regression. \newblock {\em ESAIM: Probability and Statistics}, 13:70--86. \bibitem[Schuster and Gregory, 1981]{ScGr81} Schuster, E.~F. and Gregory, G.~G. (1981). \newblock On the consistency of maximum likelihood nonparametric density estimators. \newblock In Eddy, W.~F., editor, {\em Computer Science and Statistics: Proceedings of the 13th Symposium on the Interface}, pages 295--298. Springer-Verlag, New York. \bibitem[Schwarz, 1978]{Sch:1978} Schwarz, G. (1978). \newblock Estimating the dimension of a model. \newblock {\em Ann. Statist.}, 6(2):461--464. \bibitem[Shao, 1993]{Sha:1993} Shao, J. (1993). \newblock Linear model selection by cross-validation. \newblock {\em J. Amer. Statist. Assoc.}, 88(422):486--494. \bibitem[Shao, 1996]{Sha:1996} Shao, J. (1996). \newblock Bootstrap model selection. \newblock {\em J. Amer. Statist. Assoc.}, 91(434):655--665. \bibitem[Shao, 1997]{Sha:1997} Shao, J. (1997). \newblock An asymptotic theory for linear model selection. \newblock {\em Statist. Sinica}, 7(2):221--264. \newblock With comments and a rejoinder by the author. \bibitem[Shibata, 1984]{Shi:1984} Shibata, R. (1984). \newblock Approximate efficiency of a selection procedure for the number of regression variables. \newblock {\em Biometrika}, 71(1):43--49. \bibitem[Simon, 1971]{Simo71} Simon, F. (1971). \newblock {P}rediction methods in criminology. \newblock volume~7. \bibitem[Stone, 1984]{Ston84} Stone, C. (1984). \newblock {A}n asymptotically optimal window selection rule for kernel density estimates. \newblock {\em The Annals of Statistics}, 12(4):1285--1297. \bibitem[Stone, 1974]{Sto:1974} Stone, M. (1974). \newblock Cross-validatory choice and assessment of statistical predictions. \newblock {\em J. Roy. Statist. Soc. Ser. B}, 36:111--147. \newblock With discussion by G. A. Barnard, A. C. Atkinson, L. K. Chan, A. P. Dawid, F. Downton, J. Dickey, A. G. Baker, O. 
Barndorff-Nielsen, D. R. Cox, S. Geisser, D. Hinkley, R. R. Hocking, and A. S. Young, and with a reply by the authors. \bibitem[Stone, 1977]{Sto:1977b} Stone, M. (1977). \newblock Asymptotics for and against cross-validation. \newblock {\em Biometrika}, 64(1):29--35. \bibitem[Sugiura, 1978]{Sug:1978} Sugiura, N. (1978). \newblock Further analysis of the data by {A}kaike's information criterion and the finite corrections. \newblock {\em Comm. Statist. A---Theory Methods}, 7(1):13--26. \bibitem[Tibshirani, 1996]{Tibs96} Tibshirani, R. (1996). \newblock {R}egression {S}hrinkage and {S}election via the {L}asso. \newblock {\em J. Royal Statist. Soc. Series B}, 58(1):267--288. \bibitem[van~der Laan and Dudoit, 2003]{vdL_Dud:2003} van~der Laan, M.~J. and Dudoit, S. (2003). \newblock Unified cross-validation methodology for selection among estimators and a general cross-validated adaptive epsilon-net estimator: Finite sample oracle inequalities and examples. \newblock Working Paper Series Working Paper 130, U.C. Berkeley Division of Biostatistics. \newblock available at http://www.bepress.com/ucbbiostat/paper130. \bibitem[van~der Laan et~al., 2004]{vdL_Dud_Kel:2004} van~der Laan, M.~J., Dudoit, S., and Keles, S. (2004). \newblock Asymptotic optimality of likelihood-based cross-validation. \newblock {\em Stat. Appl. Genet. Mol. Biol.}, 3:Art. 4, 27 pp. (electronic). \bibitem[van~der Laan et~al., 2006]{vdL_Dud_vdV:2006} van~der Laan, M.~J., Dudoit, S., and van~der Vaart, A.~W. (2006). \newblock The cross-validated adaptive epsilon-net estimator. \newblock {\em Statist. Decisions}, 24(3):373--395. \bibitem[van~der Vaart et~al., 2006]{vdV_Dud_vdL:2006} van~der Vaart, A.~W., Dudoit, S., and van~der Laan, M.~J. (2006). \newblock Oracle inequalities for multi-fold cross validation. \newblock {\em Statist. Decisions}, 24(3):351--371. \bibitem[van Erven et~al., 2008]{Erv_Gru_Roo:2008} van Erven, T., Gr{\"u}nwald, P.~D., and de~Rooij, S. (2008). 
\newblock Catching up faster by switching sooner: A prequential solution to the {AIC}-{BIC} dilemma. \newblock arXiv:0807.1005. \bibitem[Vapnik, 1982]{Vap:1982} Vapnik, V. (1982). \newblock {\em Estimation of dependences based on empirical data}. \newblock Springer Series in Statistics. Springer-Verlag, New York. \newblock Translated from the Russian by Samuel Kotz. \bibitem[Vapnik, 1998]{Vap:1998} Vapnik, V.~N. (1998). \newblock {\em Statistical learning theory}. \newblock Adaptive and Learning Systems for Signal Processing, Communications, and Control. John Wiley \& Sons Inc., New York. \newblock A Wiley-Interscience Publication. \bibitem[Vapnik and Chervonenkis, 1974]{Vap_Cer:1974} Vapnik, V.~N. and Chervonenkis, A.~Y. (1974). \newblock {\em Teoriya raspoznavaniya obrazov. {S}tatisticheskie problemy obucheniya}. \newblock Izdat. ``Nauka'', Moscow. \newblock Theory of Pattern Recognition (In Russian). \bibitem[Wahba, 1975]{Wahb75} Wahba, G. (1975). \newblock {P}eriodic splines for spectral density estimation: {T}he use of cross validation for determining the degree of smoothing. \newblock {\em Communications in Statistics}, 4:125--142. \bibitem[Wahba, 1977]{Wahb77a} Wahba, G. (1977). \newblock {P}ractical {A}pproximate {S}olutions to {L}inear {O}perator {E}quations {W}hen the {D}ata are {N}oisy. \newblock {\em SIAM Journal on Numerical Analysis}, 14(4):651--667. \bibitem[Wegkamp, 2003]{Wegk03} Wegkamp, M. (2003). \newblock {M}odel selection in nonparametric regression. \newblock {\em The Annals of Statistics}, 31(1):252--273. \bibitem[Yang, 2005]{Yan:2005a} Yang, Y. (2005). \newblock Can the strengths of {AIC} and {BIC} be shared? {A} conflict between model identification and regression estimation. \newblock {\em Biometrika}, 92(4):937--950. \bibitem[Yang, 2006]{Yan:2006} Yang, Y. (2006). \newblock Comparing learning methods for classification. \newblock {\em Statist. Sinica}, 16(2):635--657. \bibitem[Yang, 2007]{Yan:2007b} Yang, Y. (2007). 
\newblock Consistency of cross validation for comparing regression procedures. \newblock {\em Ann. Statist.}, 35(6):2450--2473. \bibitem[Zhang, 1993]{Zha:1993} Zhang, P. (1993). \newblock Model selection via multifold cross validation. \newblock {\em Ann. Statist.}, 21(1):299--313. \end{thebibliography}