Tensorflow reads multi csv needs too much time

I plan to read the feature data from multi csv files.

Each csv file has 150 columns, and the batch size is 256.

The time cost for just read 1000 iteration needs roughly 12s.

I feel the time cost for that should not be that much, any one here can have some suggestion?

def _parse_csv_row(*vals):

    features = tf.convert_to_tensor(vals[0:f_size * 5])

    class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)

    return features, class_label





def get_batch_data(name):

    root_path="g:\market\2018-11-12\feature_{}\".format(name)

    file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))

    record_defaults = [tf.float32] * f_size * 5 + [tf.float64]

    selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]

    dataset = tf.contrib.data.CsvDataset(

        file_queue,

        record_defaults,

        buffer_size=1024 * 1024 * 10,

        header=True,

        na_value='0.0',

        select_cols=selected_cols)

    dataset = dataset.apply(tf.contrib.data.map_and_batch(

        map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))

    dataset = dataset.prefetch(256 * 1024)

    dataset = dataset.repeat()



    dataset = dataset.shuffle(buffer_size=32)

    iterator = dataset.make_one_shot_iterator()

    feature_batch, label_batch = iterator.get_next()

    return feature_batch, label_batch



config = tf.ConfigProto()

config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:

    sess.run(tf.global_variables_initializer())

    a, b = get_batch_data("train")

    start_time = time.time()

    for x in range(1000):

        v = sess.run([a,b])

    print(time.time() - start_time)

asked Nov 26 '18 at 12:34

Carpemer

415

add a comment |

I plan to read the feature data from multi csv files.

Each csv file has 150 columns, and the batch size is 256.

The time cost for just read 1000 iteration needs roughly 12s.

I feel the time cost for that should not be that much, any one here can have some suggestion?

def _parse_csv_row(*vals):

    features = tf.convert_to_tensor(vals[0:f_size * 5])

    class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)

    return features, class_label





def get_batch_data(name):

    root_path="g:\market\2018-11-12\feature_{}\".format(name)

    file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))

    record_defaults = [tf.float32] * f_size * 5 + [tf.float64]

    selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]

    dataset = tf.contrib.data.CsvDataset(

        file_queue,

        record_defaults,

        buffer_size=1024 * 1024 * 10,

        header=True,

        na_value='0.0',

        select_cols=selected_cols)

    dataset = dataset.apply(tf.contrib.data.map_and_batch(

        map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))

    dataset = dataset.prefetch(256 * 1024)

    dataset = dataset.repeat()



    dataset = dataset.shuffle(buffer_size=32)

    iterator = dataset.make_one_shot_iterator()

    feature_batch, label_batch = iterator.get_next()

    return feature_batch, label_batch



config = tf.ConfigProto()

config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:

    sess.run(tf.global_variables_initializer())

    a, b = get_batch_data("train")

    start_time = time.time()

    for x in range(1000):

        v = sess.run([a,b])

    print(time.time() - start_time)

asked Nov 26 '18 at 12:34

Carpemer

415

add a comment |

I plan to read the feature data from multi csv files.

Each csv file has 150 columns, and the batch size is 256.

The time cost for just read 1000 iteration needs roughly 12s.

I feel the time cost for that should not be that much, any one here can have some suggestion?

def _parse_csv_row(*vals):

    features = tf.convert_to_tensor(vals[0:f_size * 5])

    class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)

    return features, class_label





def get_batch_data(name):

    root_path="g:\market\2018-11-12\feature_{}\".format(name)

    file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))

    record_defaults = [tf.float32] * f_size * 5 + [tf.float64]

    selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]

    dataset = tf.contrib.data.CsvDataset(

        file_queue,

        record_defaults,

        buffer_size=1024 * 1024 * 10,

        header=True,

        na_value='0.0',

        select_cols=selected_cols)

    dataset = dataset.apply(tf.contrib.data.map_and_batch(

        map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))

    dataset = dataset.prefetch(256 * 1024)

    dataset = dataset.repeat()



    dataset = dataset.shuffle(buffer_size=32)

    iterator = dataset.make_one_shot_iterator()

    feature_batch, label_batch = iterator.get_next()

    return feature_batch, label_batch



config = tf.ConfigProto()

config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:

    sess.run(tf.global_variables_initializer())

    a, b = get_batch_data("train")

    start_time = time.time()

    for x in range(1000):

        v = sess.run([a,b])

    print(time.time() - start_time)

asked Nov 26 '18 at 12:34

Carpemer

415

I plan to read the feature data from multi csv files.

Each csv file has 150 columns, and the batch size is 256.

The time cost for just read 1000 iteration needs roughly 12s.

I feel the time cost for that should not be that much, any one here can have some suggestion?

def _parse_csv_row(*vals):

    features = tf.convert_to_tensor(vals[0:f_size * 5])

    class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)

    return features, class_label





def get_batch_data(name):

    root_path="g:\market\2018-11-12\feature_{}\".format(name)

    file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))

    record_defaults = [tf.float32] * f_size * 5 + [tf.float64]

    selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]

    dataset = tf.contrib.data.CsvDataset(

        file_queue,

        record_defaults,

        buffer_size=1024 * 1024 * 10,

        header=True,

        na_value='0.0',

        select_cols=selected_cols)

    dataset = dataset.apply(tf.contrib.data.map_and_batch(

        map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))

    dataset = dataset.prefetch(256 * 1024)

    dataset = dataset.repeat()



    dataset = dataset.shuffle(buffer_size=32)

    iterator = dataset.make_one_shot_iterator()

    feature_batch, label_batch = iterator.get_next()

    return feature_batch, label_batch



config = tf.ConfigProto()

config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:

    sess.run(tf.global_variables_initializer())

    a, b = get_batch_data("train")

    start_time = time.time()

    for x in range(1000):

        v = sess.run([a,b])

    print(time.time() - start_time)

tensorflow tensorflow-datasets

asked Nov 26 '18 at 12:34

Carpemer

415

asked Nov 26 '18 at 12:34

Carpemer

415

asked Nov 26 '18 at 12:34

Carpemer

415

asked Nov 26 '18 at 12:34

Carpemer

415

asked Nov 26 '18 at 12:34

Carpemer

415

add a comment |

1 Answer
1

active

oldest

votes

The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.

I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.

Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53481231%2ftensorflow-reads-multi-csv-needs-too-much-time%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

add a comment |

I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

add a comment |

I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

answered Nov 26 '18 at 14:12

Vlad-HC

1,0341015

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Ytukyg