pushdump.patch
diff --git a/scripts/iterate_folder.py b/scripts/iterate_folder.py
index 27a7fbd..c3bf2cf 100644
--- a/scripts/iterate_folder.py
+++ b/scripts/iterate_folder.py
@@ -25,7 +25,7 @@ def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, by
except UnicodeDecodeError:
if bytes_read > max_window_size:
raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
-log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
+#log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)
@@ -59,7 +59,7 @@ for subdir, dirs, files in os.walk(input_folder):
total_size += file_size
input_files.append([input_path, file_size])
-log.info(f"Processing {len(input_files)} files of {(total_size / (2**30)):.2f} gigabytes")
+#log.info(f"Processing {len(input_files)} files of {(total_size / (2**30)):.2f} gigabytes")
total_lines = 0
total_bytes_processed = 0
@@ -69,14 +69,17 @@ for input_file in input_files:
created = None
for line, file_bytes_processed in read_lines_zst(input_file[0]):
obj = json.loads(line)
+if obj['author'] == 'spez': # your reddit username here w/o the u/
+#print(obj)
+print(obj['id'])
created = datetime.utcfromtimestamp(int(obj['created_utc']))
file_lines += 1
-if file_lines == 1:
-log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : 0% : {(total_bytes_processed / total_size) * 100:.0f}%")
-if file_lines % 100000 == 0:
-log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : {(file_bytes_processed / input_file[1]) * 100:.0f}% : {(total_bytes_processed / total_size) * 100:.0f}%")
+#if file_lines == 1:
+#log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : 0% : {(total_bytes_processed / total_size) * 100:.0f}%")
+#if file_lines % 100000 == 0:
+#log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : {(file_bytes_processed / input_file[1]) * 100:.0f}% : {(total_bytes_processed / total_size) * 100:.0f}%")
total_lines += file_lines
total_bytes_processed += input_file[1]
-log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : 100% : {(total_bytes_processed / total_size) * 100:.0f}%")
+#log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : 100% : {(total_bytes_processed / total_size) * 100:.0f}%")
-log.info(f"Total: {total_lines}")
+#log.info(f"Total: {total_lines}")
Add comment