TensorFlow takes too much time to read multiple CSV files
I plan to read feature data from multiple CSV files.
Each CSV file has 150 columns, and the batch size is 256.
Just reading 1000 batches takes roughly 12 s.
I feel it should not take that long; does anyone have suggestions for speeding it up?
def _parse_csv_row(*vals):
    """Split one parsed CSV record into a feature tensor and an integer label.

    ``vals`` holds one value per selected column: the first ``f_size * 5``
    entries are the features and the entry right after them is the raw label.

    Returns:
        A ``(features, class_label)`` pair; ``class_label`` is the raw label
        plus one, cast to ``tf.int64``.
    """
    n_features = f_size * 5
    # Pack the leading per-column values into a single feature tensor.
    features = tf.convert_to_tensor(vals[:n_features])
    # Shift the raw float64 label by one before casting it to an integer.
    label_offset = tf.convert_to_tensor(1.0, tf.float64)
    class_label = tf.cast(vals[n_features] + label_offset, tf.int64)
    return features, class_label
def get_batch_data(name):
    """Build the CSV input pipeline for one data split.

    Args:
        name: Split name substituted into the feature directory path
            (the timing script calls this with "train").

    Returns:
        A ``(feature_batch, label_batch)`` pair of tensors obtained from a
        one-shot iterator over the batched, shuffled, endlessly repeated
        dataset.
    """
    # Backslashes must be escaped: a bare "\f" is a form feed, "\201" is an
    # octal escape, and a trailing "\" would swallow the closing quote.
    root_path = "g:\\market\\2018-11-12\\feature_{}\\".format(name)
    # List the directory through root_path instead of formatting it twice.
    file_queue = [
        "{}{}".format(root_path, file_name)
        for file_name in fnmatch.filter(os.listdir(root_path), "*.sz_result.csv")
    ]

    record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
    # Five groups of nine consecutive columns, 29 columns apart and starting
    # at column 1, followed by column 146.
    selected_cols = [
        col
        for group in range(5)
        for col in range(1 + group * 29, 1 + group * 29 + 9)
    ] + [146]

    dataset = tf.contrib.data.CsvDataset(
        file_queue,
        record_defaults,
        buffer_size=1024 * 1024 * 10,
        header=True,
        na_value='0.0',
        select_cols=selected_cols)
    dataset = dataset.apply(tf.contrib.data.map_and_batch(
        map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
    # Shuffle before repeat so elements of different epochs are not mixed.
    # Elements are whole batches at this point, so this shuffles batches.
    dataset = dataset.shuffle(buffer_size=32)
    dataset = dataset.repeat()
    # Prefetch last, and only one element: each element is already a full
    # batch, so a 256 * 1024 buffer here would try to hold that many batches
    # before the first one is handed to the consumer.
    dataset = dataset.prefetch(1)

    iterator = dataset.make_one_shot_iterator()
    feature_batch, label_batch = iterator.get_next()
    return feature_batch, label_batch
# Session options: enable allow_growth on the GPU options.
session_config = tf.ConfigProto()
session_config.gpu_options.allow_growth = True

with tf.Session(config=session_config) as sess:
    sess.run(tf.global_variables_initializer())
    feature_batch, label_batch = get_batch_data("train")

    # Time 1000 consecutive fetches of one (features, labels) batch.
    # NOTE(review): the pasted snippet lost its indentation; the print is
    # placed after the loop to match the reported total for 1000 iterations
    # -- confirm against the original script.
    start_time = time.time()
    for _ in range(1000):
        sess.run([feature_batch, label_batch])
    elapsed = time.time() - start_time
    print(elapsed)
tensorflow tensorflow-datasets
add a comment |
I plan to read the feature data from multi csv files.
Each csv file has 150 columns, and the batch size is 256.
The time cost for just read 1000 iteration needs roughly 12s.
I feel the time cost for that should not be that much, any one here can have some suggestion?
def _parse_csv_row(*vals):
features = tf.convert_to_tensor(vals[0:f_size * 5])
class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)
return features, class_label
def get_batch_data(name):
root_path="g:\market\2018-11-12\feature_{}\".format(name)
file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))
record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]
dataset = tf.contrib.data.CsvDataset(
file_queue,
record_defaults,
buffer_size=1024 * 1024 * 10,
header=True,
na_value='0.0',
select_cols=selected_cols)
dataset = dataset.apply(tf.contrib.data.map_and_batch(
map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
dataset = dataset.prefetch(256 * 1024)
dataset = dataset.repeat()
dataset = dataset.shuffle(buffer_size=32)
iterator = dataset.make_one_shot_iterator()
feature_batch, label_batch = iterator.get_next()
return feature_batch, label_batch
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
a, b = get_batch_data("train")
start_time = time.time()
for x in range(1000):
v = sess.run([a,b])
print(time.time() - start_time)
tensorflow tensorflow-datasets
add a comment |
I plan to read the feature data from multi csv files.
Each csv file has 150 columns, and the batch size is 256.
The time cost for just read 1000 iteration needs roughly 12s.
I feel the time cost for that should not be that much, any one here can have some suggestion?
def _parse_csv_row(*vals):
features = tf.convert_to_tensor(vals[0:f_size * 5])
class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)
return features, class_label
def get_batch_data(name):
root_path="g:\market\2018-11-12\feature_{}\".format(name)
file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))
record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]
dataset = tf.contrib.data.CsvDataset(
file_queue,
record_defaults,
buffer_size=1024 * 1024 * 10,
header=True,
na_value='0.0',
select_cols=selected_cols)
dataset = dataset.apply(tf.contrib.data.map_and_batch(
map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
dataset = dataset.prefetch(256 * 1024)
dataset = dataset.repeat()
dataset = dataset.shuffle(buffer_size=32)
iterator = dataset.make_one_shot_iterator()
feature_batch, label_batch = iterator.get_next()
return feature_batch, label_batch
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
a, b = get_batch_data("train")
start_time = time.time()
for x in range(1000):
v = sess.run([a,b])
print(time.time() - start_time)
tensorflow tensorflow-datasets
I plan to read the feature data from multi csv files.
Each csv file has 150 columns, and the batch size is 256.
The time cost for just read 1000 iteration needs roughly 12s.
I feel the time cost for that should not be that much, any one here can have some suggestion?
def _parse_csv_row(*vals):
features = tf.convert_to_tensor(vals[0:f_size * 5])
class_label = tf.cast(vals[f_size * 5] + tf.convert_to_tensor(1.0, tf.float64), tf.int64)
return features, class_label
def get_batch_data(name):
root_path="g:\market\2018-11-12\feature_{}\".format(name)
file_queue = list(map(lambda x: "{}{}".format(root_path, x), fnmatch.filter(os.listdir("g:\market\2018-11-12\feature_{}\".format(name)), "*.sz_result.csv")))
record_defaults = [tf.float32] * f_size * 5 + [tf.float64]
selected_cols = reduce(lambda x, y: x + y, [list(range(1 + x * 29, 1 + x * 29 + 9)) for x in range(0, 5)]) + [146]
dataset = tf.contrib.data.CsvDataset(
file_queue,
record_defaults,
buffer_size=1024 * 1024 * 10,
header=True,
na_value='0.0',
select_cols=selected_cols)
dataset = dataset.apply(tf.contrib.data.map_and_batch(
map_func=_parse_csv_row, batch_size=train_config.BATCH_SIZE))
dataset = dataset.prefetch(256 * 1024)
dataset = dataset.repeat()
dataset = dataset.shuffle(buffer_size=32)
iterator = dataset.make_one_shot_iterator()
feature_batch, label_batch = iterator.get_next()
return feature_batch, label_batch
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
a, b = get_batch_data("train")
start_time = time.time()
for x in range(1000):
v = sess.run([a,b])
print(time.time() - start_time)
tensorflow tensorflow-datasets
tensorflow tensorflow-datasets
asked Nov 26 '18 at 12:34
CarpemerCarpemer
415
415
add a comment |
add a comment |
1 Answer
1
active
oldest
votes
The problem is the line dataset = dataset.prefetch(256 * 1024). It comes after the map_and_batch operation, which means that you are prefetching 256 * 1024 batches. So, when your program tries to load the first record, it actually first loads 256 * 1024 * 256 records. Your intention was probably to prefetch only 1024 batches. In practice it is enough to prefetch a single element (here, one batch).
I would put ds.prefetch(1) as the last operation on the dataset. See Summary of Best Practices.
Also, you allocate a pretty big buffer for reading the CSV files (the buffer_size parameter). If your intention is to cache the whole CSV file, you can use the ds.cache() operation. Without arguments it caches the contents in memory. Put it before the ds.repeat() operation.
add a comment |
Your Answer
// Calls StackExchange.snippets.init() from inside nested "externalEditor" and
// "snippets" module callbacks, registered under the "editor" module.
// NOTE(review): presumably ifUsing/using load the named modules on demand and
// "code-snippets" is a registration key -- confirm against the StackExchange
// client library.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
// Page-ready hook: sets up the tag renderer, then creates the answer editor
// (after the snippets module when snippets are enabled).
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Configures the answer editor. The HTML fragments below keep their
    // \u003c / \u003e escapes and escaped quotes; without the backslashes the
    // string literals terminate early and the script does not parse.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53481231%2ftensorflow-reads-multi-csv-needs-too-much-time%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.
I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.
Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.
add a comment |
The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.
I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.
Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.
add a comment |
The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.
I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.
Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.
The line dataset = dataset.prefetch(256 * 1024). It's written after map_and_batch operation. That means that your are prefetching 256 * 1024 batches. So, when your program tries to load 1st record, it actually first loads 256 * 1024 * 256 records. Probably your intention was to prefetch only 1024 batches. In real life it's enough to prefetch only one record.
I would put ds.prefetch(1) line as the last operation on the dataset. See Summary of Best Practices.
Also you allocate pretty big buffer for reading CSV file (buffer_size parameter). If your intention is to cache the whole csv file, you can use ds.cache() operation. Without parameters it caches contents in the memory. Put it before the ds.repeat() operation.
answered Nov 26 '18 at 14:12
Vlad-HCVlad-HC
1,0341015
1,0341015
add a comment |
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53481231%2ftensorflow-reads-multi-csv-needs-too-much-time%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown