;==========================================================================================================================================
; How to split a big text file into smaller files of n lines size each?
;
; Detlev Dalitz.20040331.20100205.
;==========================================================================================================================================

;------------------------------------------------------------------------------------------------------------------------------------------
; Original request (Wednesday, March 31, 2004 12:15 AM):
;
;   Hi,
;   can you help me with following problem?
;   I have "big" file with more than 2800000 lines. I need split this file to smaller files with 64000 lines.
;
;   Thanks Patrik
;   patrikm patrikm@moravia-it.com
;------------------------------------------------------------------------------------------------------------------------------------------
; Follow-up questions and answers:
;
;   Q: How often do you have to do this task?
;   A: Many times - I have database export and I need work with these data in Excel
;
;   Q: Is the file a text file, line delimited by CRLF sequence?
;   A: Lines are delimited with CRLF sequence
;
;   Q: What filesize overall?
;   A: About 400 MB
;------------------------------------------------------------------------------------------------------------------------------------------

; Texts shown in the progress box, and the placeholder tokens that get
; substituted into the message / filename masks further down.
strMsgTitle = "SplitBigFile"
strMsgText = "Searching split points ..."
@P1 = "{1}"
@P2 = "{2}"

; Input file.
; For the test case this script itself serves as the input file.
strFilename = IntControl (1004, 0, 0, 0, 0)
; A real-world case would be e.g. 2.800.000 lines, 400 MB:
;strFilename = "drive:\folder\bigfile.txt" ; <== Change path to your needs.

; Nothing to split if the input file is empty.
intFilesize = FileSize (strFilename, 1)
Terminate (intFilesize == "0", "Error", StrCat (strFilename, @LF, "Filesize is zero."))

;----------------------------
BoxOpen (strMsgTitle, strMsgText)
strMsgTextMask = strMsgText : @LF : strFilename : @LF : intFilesize : "/" : @P1

; --- Pass 1 ----------------
; Walk through the big file chunk by chunk,
; count the occurrences of the search literal,
; and collect the file offset that follows every intSearchMax-th occurrence
; into the tab delimited itemlist strListSplit.
; The last item of the list is always the file size, so that Pass 2 also
; writes the remainder of the file.

; What do we search? We search for CRLF sequences in the big text file.
strSearch = @CRLF
intSearchLen = StrLen (strSearch)

; Big text file to split into files of n lines each.
;intSearchMax = 64000 ; <== Change number to your needs. <==
intSearchMax = 20     ; For the test case we create split files with a size of 20 lines each (= 20 CRLF's).

; Chunk size can be adjusted to smaller or bigger chunks, depending on file size and system resources.
; The chunk must never be smaller than the search literal; without the lower
; bound a file of less than 100 bytes would yield a chunk size of zero.
intChunksize = Max (intFilesize / 100, intSearchLen) ; <== Change chunk size to your needs. <==

; Each read fetches (intSearchLen - 1) extra bytes beyond the chunk, so that a
; search literal which straddles a chunk boundary (CR at the end of one chunk,
; LF at the start of the next) is still found, and is counted exactly once.
intOverlap = intSearchLen - 1
intReadSize = intChunksize + intOverlap

strListSplit = ""
intOffsetFile = 0
intOffsetSplit = 0   ; Must be defined even if no split point is ever found.
intCountSearch = 0
intCarry = 0         ; Bytes at the start of the next chunk already consumed by a straddling match.

hdlBB = BinaryAlloc (intReadSize)

While intOffsetFile < intFilesize
   intBytesRead = BinaryReadEx (hdlBB, 0, strFilename, intOffsetFile, intReadSize)
   If intBytesRead <= 0 Then Break

   intOffsetBB = intCarry
   While intOffsetBB < intBytesRead
      intFound = BinaryIndexEx (hdlBB, intOffsetBB, strSearch, @FWDSCAN, @TRUE)
      If intFound < 0 Then Break
      ; A match that starts in the overlap area belongs to the next chunk.
      If intFound >= intChunksize Then Break
      ; A match beyond the bytes actually read is leftover data from a previous read.
      If (intFound + intSearchLen) > intBytesRead Then Break

      intOffsetBB = intFound + intSearchLen
      intCountSearch = intCountSearch + 1
      If !(intCountSearch mod intSearchMax)
         intOffsetSplit = intOffsetFile + intOffsetBB
         strListSplit = ItemInsert (intOffsetSplit, -1, strListSplit, @TAB)
         BoxText (StrReplace (strMsgTextMask, @P1, intOffsetSplit))
      EndIf
   EndWhile

   ; If the last counted match ended inside the overlap area, skip those bytes in the next chunk.
   intCarry = Max (0, intOffsetBB - intChunksize)
   intOffsetFile = intOffsetFile + intChunksize
EndWhile

; The remainder of the file (or the whole file, if it has fewer than
; intSearchMax lines) becomes the last split file.
If intOffsetSplit < intFilesize
   strListSplit = ItemInsert (intFilesize, -1, strListSplit, @TAB)
   BoxText (StrReplace (strMsgTextMask, @P1, intFilesize))
EndIf

hdlBB = BinaryFree (hdlBB)

; --- Pass 2 ----------------
; Create the split files.
intCount = ItemCount (strListSplit, @TAB)
intCountLen = StrLen (intCount)

strMsgText = "Writing split files ..."
BoxText (strMsgText)
strMsgTextMask = StrCat (strMsgText, @LF, intCount, "/", @P1, @LF, @P2)
strFileOutMask = StrCat (strFilename, ".part.", intCount, ".", @P1, ".txt")

; Each item in strListSplit is the file offset where one part ends;
; the part begins where the previous one ended.
intPartBegin = 0
intPartEnd = 0
intCapacity = 0          ; Size of the currently allocated binary buffer.
hdlBB = BinaryAlloc (0)

intNum = 1
While intNum <= intCount
   intPartEnd = ItemExtract (intNum, strListSplit, @TAB)
   intPartSize = intPartEnd - intPartBegin

   ; Re-allocate the buffer only when this part is bigger than any part seen so far.
   If intCapacity < intPartSize
      intCapacity = intPartSize
      hdlBB = BinaryFree (hdlBB)
      hdlBB = BinaryAlloc (intPartSize)
   EndIf

   ; The counter could be used as is (strNum = intNum), but for better
   ; filename sorting it is zero-padded to a fixed length.
   strNum = StrFixLeft (intNum, "0", intCountLen)
   strFilenameOut = StrReplace (strFileOutMask, @P1, strNum)

   ; Copy the byte range [intPartBegin, intPartEnd) of the big file into the part file.
   intBytesRead = BinaryReadEx (hdlBB, 0, strFilename, intPartBegin, intPartSize)
   intBytesWritten = BinaryWriteEx (hdlBB, 0, strFilenameOut, 0, intPartSize)
   intPartBegin = intPartEnd

   strProgress = StrReplace (strMsgTextMask, @P1, strNum)
   strProgress = StrReplace (strProgress, @P2, strFilenameOut)
   BoxText (strProgress)

   intNum = intNum + 1
EndWhile

hdlBB = BinaryFree (hdlBB)
BoxShut ()

; Look into the folder.
Run ("explorer.exe", StrCat ("/select, ", strFilename))

Exit
;==========================================================================================================================================