Normally a file is sorted as shown below, but this approach loads the whole file into memory, so it consumes memory in proportion to the file size.
# Naive approach: every line of the input is held in memory while sorting,
# so memory use grows with the size of the file.
with open('/path/to/not-sorted-file', 'r') as fr:
    new_lines = sorted(fr.readlines())  # loads the entire file into memory
# The lines were read as text, so the output is opened in text mode too;
# writing str to a file opened with 'wb' raises TypeError on Python 3.
with open('/path/to/sorted-file', 'w') as fw:
    # writelines avoids building a second, joined copy of the whole file.
    fw.writelines(new_lines)
So I wrote a function instead. It works roughly as follows ↓.
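1. Read the original file through fr one line at a time, recording each line's sort key and file offset value.
2. Sort the recorded (sort key, file offset) pairs by sort key.
3. Open the output file as fw.
4. For each pair, in sorted order, seek fr to the file offset value and read the line: line = fr.readline()
5. Write that line to the output: fw.write(line)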
This method still consumes memory in proportion to "size of the sort-key substring × number of lines" (;_;)
import os
import uuid
import tempfile
def sort_large_file(filename, key=lambda l: l[:5], encoding='utf-8'):
    '''
    Sort the lines of a large file in place without loading it into memory.

    Only one ``(sort key, byte offset)`` pair per line is kept in memory.
    The original file is first renamed to a backup next to it; the sorted
    result is then written under the original name by seeking to each
    recorded offset in the backup. Lines with equal keys keep their
    original relative order. If anything fails along the way, the original
    file is put back under its original name.

    :param str filename: abspath of file. The file is replaced by its
        sorted version.
    :param function key: the function makes sort-key from a line. It is
        given each line as text, including its line terminator.
    :param str encoding: encoding used to decode a line before it is handed
        to ``key``. The bytes written to the output are never re-encoded.
    '''
    # Save the file before sorting. The backup is created in the same
    # directory as the target: os.rename can fail when source and
    # destination are on different filesystems, which may be the case for
    # the system temp directory.
    target_dir = os.path.dirname(os.path.abspath(filename))
    # UUID.hex is available on both Python 2 and 3 (get_hex() is Python 2 only).
    tmpname = os.path.join(target_dir, 'sortlargefile_%s' % uuid.uuid4().hex)
    os.rename(filename, tmpname)
    try:
        # Binary mode keeps tell()/seek() working on plain byte offsets and
        # copies every line byte-for-byte.
        with open(tmpname, 'rb') as fr:
            # Make a list of (sort key, offset) pairs, one per line.
            offset_list = []
            while True:
                offset = fr.tell()
                line = fr.readline()
                if not line:
                    break
                # Binary reads return bytes on Python 3; decode so that key
                # functions written for text keep working. On Python 2 the
                # line is already a str and is passed through unchanged.
                text = line if isinstance(line, str) else line.decode(encoding)
                offset_list.append((key(text), offset))
            # Sort offsets. Only the key is compared; list.sort is stable,
            # so ties stay in file order.
            offset_list.sort(key=lambda entry: entry[0])
            # Write the lines to the new file in sorted order.
            with open(filename, 'wb') as fw:
                for _, offset in offset_list:
                    fr.seek(offset)
                    line = fr.readline()
                    # Only the final line of the input can lack a terminator;
                    # add one so it is not merged with the line written next.
                    if not line.endswith(b'\n'):
                        line += b'\n'
                    fw.write(line)
    except BaseException:
        # Put the untouched original back so a failure never loses data.
        if os.path.exists(filename):
            os.remove(filename)
        os.rename(tmpname, filename)
        raise
    # Remove the backup only after the sorted file is completely written.
    os.remove(tmpname)
Call the function as follows.
> sort_large_file('/path/to/your/file', lambda l: l[:l.find(',')])
↓ This is the original CSV.
2016-10-01,apple,red
2016-09-29,orange,orange
2015-12-21,banana,yellow
Note that the last line of the file passed to sort_large_file() should also end with a line break.
↓ It will be sorted like this.
2015-12-21,banana,yellow
2016-09-29,orange,orange
2016-10-01,apple,red
Recommended Posts