I am using external memory training with `#dtrain.cache` as a suffix on the training data file path, and I am getting segmentation faults.
I saw https://github.com/dmlc/xgboost/issues/1133, but it seems that problem was resolved for those users by a fix. Are there any pointers on how I can avoid the segmentation fault with external memory training?
EDIT (after @thvasilo's comment):
I ran the following code under valgrind with 200k instances and got the error shown below. I did not get any error when I ran the same script with 100k instances.
import xgboost as xgb
import numpy as np
import time
import multiprocessing
# Reproduction script: train a binary classifier on a libsvm file, loading the
# data with the cache suffix the report uses for external memory training.
train_file = 'train_data.txt'  # training data in libsvm format
# The "#dtrain.cache" suffix is what the report uses to enable external memory.
train_file_with_cache = train_file + "#dtrain.cache"

n_trees = 500  # number of boosting rounds passed to xgb.train
num_cpus = multiprocessing.cpu_count()  # thread count = number of CPUs reported

# Booster parameters; 'nthread' is set from the CPU count computed above.
param = {
    'max_depth': 5,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'nthread': num_cpus,
}

dtrain = xgb.DMatrix(train_file_with_cache)
bst = xgb.train(param, dtrain, num_boost_round=n_trees)
Here is the error output from the valgrind run:
```
==6607== at 0x1A1F2D21: xgboost::tree::CQHistMakerxgboost::tree::GradStats::InitWorkSet(xgboost::DMatrix*, xgboost::RegTree const&, std::vector<unsigned int, std::allocator >) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A1F697D: xgboost::tree::HistMakerxgboost::tree::GradStats::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal >, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocatorxgboost::RegTree* > const&) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A25D75F: xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal >, xgboost::DMatrix, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_deletexgboost::RegTree >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_deletexgboost::RegTree > > >) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A25EC1E: xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal >, xgboost::ObjFunction) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A0E200D: xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A276664: XGBoosterUpdateOneIter (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x715EEB1: ffi_call_unix64 (unix64.S:76)
==6607== by 0x715E8D3: ffi_call (ffi64.c:525)
==6607== by 0x714D8F7: _call_function_pointer (callproc.c:836)
==6607== by 0x714E520: _ctypes_callproc (callproc.c:1179)
==6607== by 0x714710C: PyCFuncPtr_call (_ctypes.c:3965)
==6607== by 0x14B743: PyObject_Call (abstract.c:2546)
==6607== Address 0x5dca6f4 is 12 bytes after a block of size 344 alloc'd
==6607== at 0x4C2C21F: operator new(unsigned long) (vg_replace_malloc.c:334)
==6607== by 0x1A1F3384: xgboost::tree::CQHistMakerxgboost::tree::GradStats::InitWorkSet(xgboost::DMatrix*, xgboost::RegTree const&, std::vector<unsigned int, std::allocator >) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A1F697D: xgboost::tree::HistMakerxgboost::tree::GradStats::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal >, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocatorxgboost::RegTree* > const&) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A25D75F: xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal >, xgboost::DMatrix, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_deletexgboost::RegTree >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_deletexgboost::RegTree > > >) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A25EC1E: xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal >, xgboost::ObjFunction) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A0E200D: xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*) (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x1A276664: XGBoosterUpdateOneIter (in /home/username/python_debug_env/lib/python2.7/site-packages/xgboost-0.81-py2.7.egg/xgboost/lib/libxgboost.so)
==6607== by 0x715EEB1: ffi_call_unix64 (unix64.S:76)
==6607== by 0x715E8D3: ffi_call (ffi64.c:525)
==6607== by 0x714D8F7: _call_function_pointer (callproc.c:836)
==6607== by 0x714E520: _ctypes_callproc (callproc.c:1179)
==6607== by 0x714710C: PyCFuncPtr_call (_ctypes.c:3965)
==6607==
==6607== Invalid read of size 8
==6607== at 0x24D762: visit_decref (gcmodule.c:360)
==6607== by 0x18797A: dict_traverse (dictobject.c:2114)
==6607== by 0x24D853: subtract_refs (gcmodule.c:385)
==6607== by 0x24E8B5: collect (gcmodule.c:925)
==6607== by 0x24F639: PyGC_Collect (gcmodule.c:1440)
==6607== by 0x233A07: Py_Finalize (pythonrun.c:448)
==6607== by 0x140308: Py_Main (main.c:665)
==6607== by 0x13EDFF: main (python.c:23)
==6607== Address 0x3f8000000051f238 is not stack'd, malloc'd or (recently) free'd
==6607==
==6607==
==6607== Process terminating with default action of signal 11 (SIGSEGV)
==6607== General Protection Fault
==6607== at 0x24D762: visit_decref (gcmodule.c:360)
==6607== by 0x18797A: dict_traverse (dictobject.c:2114)
==6607== by 0x24D853: subtract_refs (gcmodule.c:385)
==6607== by 0x24E8B5: collect (gcmodule.c:925)
==6607== by 0x24F639: PyGC_Collect (gcmodule.c:1440)
==6607== by 0x233A07: Py_Finalize (pythonrun.c:448)
==6607== by 0x140308: Py_Main (main.c:665)
==6607== by 0x13EDFF: main (python.c:23)
==6607==
==6607== HEAP SUMMARY:
==6607== in use at exit: 214,472,668 bytes in 68,975 blocks
==6607== total heap usage: 197,280 allocs, 128,305 frees, 967,454,846 bytes allocated
==6607==
==6607== LEAK SUMMARY:
==6607== definitely lost: 104 bytes in 2 blocks
==6607== indirectly lost: 0 bytes in 0 blocks
==6607== possibly lost: 3,167,370 bytes in 18,705 blocks
==6607== still reachable: 211,305,162 bytes in 50,267 blocks
==6607== of which reachable via heuristic:
==6607== newarray : 560 bytes in 35 blocks
==6607== suppressed: 32 bytes in 1 blocks
==6607== Rerun with --leak-check=full to see details of leaked memory
==6607==
==6607== For counts of detected and suppressed errors, rerun with: -v
==6607== ERROR SUMMARY: 69 errors from 5 contexts (suppressed: 0 from 0)```