NotificationsYou must be signed in to change notification settings
Fork0
Star0

Commit9828e28

committed

added csv_split

1 parent8b65a7c commit9828e28Copy full SHA for 9828e28

File tree

1 file changed

+114

-150

lines changed

12_csv_split.py

1 file changed

+114

-150

lines changed

`‎12_csv_split.py`

Lines changed: 114 additions & 150 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,167 +1,131 @@`
`1`		`-### WIP`
`2`		`-`
`3`	`1`	`importsys`
`4`	`2`	`importos`
`5`		`-importgetopt`
`6`	`3`	`importcsv`
	`4`	`+importargparse`
`7`	`5`
`8`	`6`	`"""`
`9`		`- Splits a CSV file into multiple pieces based on command line arguments.`
	`7`	`+`
	`8`	`+Splits a CSV file into multiple pieces based on command line arguments.`
`10`	`9`
`11`	`10`	`Arguments:`
`12`		- `-h`: help file of usage of the script
`13`		- `-i`:input filename
`14`		- `-o`:output file, A %s-style template for the numbered output files.
`15`		- `-r`:row limit to split
`16`		- `-c`:A %s-style template for the numbered output files.
	`11`	`+`
	`12`	+`-h`:help fileof usage of the script
	`13`	+`-i`:input file name
	`14`	+`-o`:output file name
	`15`	+`-r`:row limit to split
`17`	`16`
`18`	`17`	`Default settings:`
`19`		- `output_path` is the current directory
`20`		- `keep_headers` is on (headers will be kept)
`21`		- `delimeter` is ,
	`18`	`+`
	`19`	+ `output_path` is the current directory
	`20`	`+ headers are displayed on each split file`
	`21`	`+ the default delimeter is a comma`
`22`	`22`
`23`	`23`	`Example usage:`
`24`		`- # split by every 10000 rows`
`25`		`- >> python 12_csv_split.py -i input.csv -o rownumber -r 10000`
`26`		`- # split by unique items in column 0`
`27`		`- >> python 12_csv_split.py -i input.csv -o userid -c 0`
`28`		`- # access help`
`29`		`- >> python 12_csv_split.py -h for help`
`30`		`-`
	`24`	`+`
	`25`	+ ```
	`26`	`+ # split csv by every 100 rows`
	`27`	`+ >> python csv_split.py -i input.csv -o output -r 100`
	`28`	+ ```
	`29`	`+`
`31`	`30`	`"""`
`32`	`31`
`33`		`-defmain(argv):`
`34`		`-`
`35`		`-argument_dict=grab_command_line_arguments(argv)`
`36`		`-parse_file(argument_dict)`
`37`		`-`
`38`		`-`
`39`		`-defgrab_command_line_arguments(argv):`
`40`		`-`
`41`		`-# global variables`
`42`		`-inputfile=''`
`43`		`-outputfile=''`
`44`		`-rowlimit=''`
`45`		`-columnindex=''`
`46`		`-argument_dict= {}`
`47`		`-`
`48`		`-# grab arguments`
`49`		`-opts,args=getopt.getopt(argv,"hi:o:r:c:",["ifile=","ofile=","rowlimit=","columnindex="])`
`50`		`-`
`51`		`-# end if no arguments provided`
`52`		`-ifnotopts:`
`53`		-print"No options provided. Try again. Use `-h` for help."
`54`		`-sys.exit()`
`55`		`-`
`56`		`-# grab arguments`
`57`		`-foropt,arginopts:`
`58`		`-ifopt=='-h':`
`59`		`-print'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'`
`60`		`-sys.exit()`
`61`		`-elifoptin ("-i","--ifile"):`
`62`		`-inputfile=arg`
`63`		`-elifoptin ("-o","--ofile"):`
`64`		`-outputfile=arg`
`65`		`-elifoptin ("-r","--rowlimit"):`
`66`		`-rowlimit=arg`
`67`		`-elifoptin ("-c","--columnindex"):`
`68`		`-columnindex=arg`
`69`		`-`
`70`		`-# Output arguments`
`71`		`-print"\nArguments:"`
`72`		`-ifinputfile:`
`73`		`-argument_dict["input_file"]=inputfile`
`74`		`-print"Input file is '{}'".format(inputfile)`
`75`		`-else:`
`76`		`-"Please enter an input file."`
`77`		`-ifoutputfile:`
`78`		`-argument_dict["output_file"]=outputfile`
`79`		`-print"Output file is '{}'".format(outputfile)`
`80`		`-else:`
`81`		`-print"Please enter an output file."`
`82`		`-ifrowlimit:`
`83`		`-argument_dict["rowlimit"]=rowlimit`
`84`		`-print"Rowlimit is '{}'".format(rowlimit)`
`85`		`-ifcolumnindex:`
`86`		`-argument_dict["columnindex"]=columnindex`
`87`		`-print"Columnindex is '{}'".format(columnindex)`
`88`		`-ifrowlimitandcolumnindex:`
`89`		`-print"Please use either a rowlimit or columnlimit, not both."`
`90`		`-sys.exit()`
`91`		`-ifnotrowlimitorcolumnindex:`
`92`		`-print"Please enter either a rowlimit or columnlimit."`
`93`		`-sys.exit()`
`94`		`-`
`95`		`-# to do - check to make sure file, rowlimit, and columnlimit exist`
`96`		`-printargument_dict`
`97`		`-returnargument_dict`
`98`		`-`
`99`		`-`
`100`		`-defparse_file(argument_dict):`
`101`		`-`
`102`		`-#split csv file by certain rownumber`
`103`		`-ifargument_dict["rowlimit"]:`
`104`		`-rowlimit=int(argument_dict["rowlimit"])`
`105`		`-output_name_file="{}.csv".format(argument_dict["output_file"])`
`106`		`-output_path='.'`
`107`		`-keep_headers=True`
`108`		`-delimiter=','`
`109`		`-filehandler=open(argument_dict["input_file"],'r')`
`110`		`-reader=csv.reader(filehandler,delimiter=delimiter)`
`111`		`-current_piece=1`
`112`		`-current_out_path=os.path.join(`
`113`		`-output_path,`
`114`		`-output_name_file`
	`32`	`+`
	`33`	`+defget_arguments():`
	`34`	`+"""Grab user supplied arguments using the argparse library."""`
	`35`	`+`
	`36`	`+# Use arparse to get command line arguments`
	`37`	`+parser=argparse.ArgumentParser()`
	`38`	`+parser.add_argument("-i","--input_file",required=True,`
	`39`	`+help="csv input file (with extension)",type=str)`
	`40`	`+parser.add_argument("-o","--output_file",required=True,`
	`41`	`+help="csv output file (without extension)",type=str)`
	`42`	`+parser.add_argument("-r","--row_limit",required=True,`
	`43`	`+help="row limit to split csv at",type=int)`
	`44`	`+args=parser.parse_args()`
	`45`	`+`
	`46`	`+# Check if the input_file exits`
	`47`	`+is_valid_file(parser,args.input_file)`
	`48`	`+`
	`49`	`+# Check if the input_file is valid`
	`50`	`+is_valid_csv(parser,args.input_file,args.row_limit)`
	`51`	`+`
	`52`	`+returnargs.input_file,args.output_file,args.row_limit`
	`53`	`+`
	`54`	`+`
	`55`	`+defis_valid_file(parser,file_name):`
	`56`	`+"""Ensure that the input_file exists."""`
	`57`	`+ifnotos.path.exists(file_name):`
	`58`	`+parser.error("The file '{}' does not exist!".format(file_name))`
	`59`	`+sys.exit(1)`
	`60`	`+`
	`61`	`+`
	`62`	`+defis_valid_csv(parser,file_name,row_limit):`
	`63`	`+"""`
	`64`	`+ Ensure that the # of rows in the input_file`
	`65`	`+ is greater than the row_limit.`
	`66`	`+ """`
	`67`	`+row_count=0`
	`68`	`+forrowincsv.reader(open(file_name)):`
	`69`	`+row_count+=1`
	`70`	`+# Note: You could also use a generator expression`
	`71`	`+# and the sum() function to count the rows:`
	`72`	`+# row_count = sum(1 for row in csv.reader(open(file_name)))`
	`73`	`+ifrow_limit>row_count:`
	`74`	`+parser.error(`
	`75`	`+"The 'row_count' of '{}' is > the number of rows in '{}'!"`
	`76`	`+ .format(row_limit,file_name)`
`115`	`77`	`)`
`116`		`-current_out_writer=csv.writer(open(current_out_path,'w'),delimiter=delimiter)`
`117`		`-current_limit=rowlimit`
`118`		`-ifkeep_headers:`
`119`		`-headers=reader.next()`
`120`		`-current_out_writer.writerow(headers)`
`121`		`-fori,rowinenumerate(reader):`
`122`		`-ifi+1>current_limit:`
`123`		`-current_piece+=1`
`124`		`-current_limit=rowlimit*current_piece`
`125`		`-current_out_path=os.path.join(`
`126`		`-output_path,`
`127`		`-output_name_file`
`128`		`- )`
`129`		`-current_out_writer=csv.writer(open(current_out_path,'w'),delimiter=delimiter)`
`130`		`-`
`131`		`-# elif columnindex: #split csv file accrording to unique values of certain column,it's like filter only certain item in excel`
`132`		`-# itemlist = []`
`133`		`-# columnindex = int(columnindex)`
`134`		`-# output_name_template= outputfile+'_%s.csv'`
`135`		`-# output_path='.'`
`136`		`-# keep_headers=True`
`137`		`-# delimiter=','`
`138`		`-# filehandler = open(inputfile,'r')`
`139`		`-# reader = csv.reader(filehandler, delimiter=delimiter)`
`140`		`-# if keep_headers:`
`141`		`-# headers = reader.next()`
`142`		`-`
`143`		`-# for i, row in enumerate(reader):`
`144`		`-`
`145`		`-# current_out_path = os.path.join(`
`146`		`-# output_path,`
`147`		`-# output_name_template % row[columnindex] )`
`148`		`-# if row[columnindex] not in itemlist:`
`149`		`-# try:`
`150`		`-# current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)`
`151`		`-# except IOError:`
`152`		`-# continue`
`153`		`-# else:`
`154`		`-# itemlist.append(row[columnindex])`
`155`		`-# if keep_headers:`
`156`		`-# current_out_writer.writerow(headers)`
`157`		`-# current_out_writer.writerow(row)`
`158`		`-# else:`
`159`		`-# current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)`
`160`		`-# current_out_writer.writerow(row)`
`161`		`-# print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)`
`162`		`-# else:`
`163`		`-# print "oops, please check instruction of script by >>./csvsplit.py -h"`
	`78`	`+sys.exit(1)`
	`79`	`+`
	`80`	`+`
	`81`	`+defparse_file(arguments):`
	`82`	`+"""`
	`83`	`+ Splits the CSV into multiple files or chunks based on the row_limit.`
	`84`	`+ Then create new CSV files.`
	`85`	`+ """`
	`86`	`+input_file=arguments[0]`
	`87`	`+output_file=arguments[1]`
	`88`	`+row_limit=arguments[2]`
	`89`	`+output_path='.'# Current directory`
	`90`	`+`
	`91`	`+# Read CSV, split into list of lists`
	`92`	`+withopen(input_file,'r')asinput_csv:`
	`93`	`+datareader=csv.reader(input_csv)`
	`94`	`+all_rows= []`
	`95`	`+forrowindatareader:`
	`96`	`+all_rows.append(row)`
	`97`	`+`
	`98`	`+# Remove header`
	`99`	`+header=all_rows.pop(0)`
	`100`	`+`
	`101`	`+# Split list of list into chunks`
	`102`	`+current_chunk=0`
	`103`	`+foriinrange(0,len(all_rows),row_limit):# Loop through list`
	`104`	`+chunk=all_rows[i:i+row_limit]# Create single chunk`
	`105`	`+`
	`106`	`+current_output=os.path.join(# Create new output file`
	`107`	`+output_path,`
	`108`	`+"{}-{}.csv".format(output_file,current_chunk)`
	`109`	`+ )`
	`110`	`+`
	`111`	`+# Add header`
	`112`	`+chunk.insert(0,header)`
	`113`	`+`
	`114`	`+# Write chunk to output file`
	`115`	`+withopen(current_output,'w')asoutput_csv:`
	`116`	`+writer=csv.writer(output_csv)`
	`117`	`+writer=writer.writerows(chunk)`
	`118`	`+`
	`119`	`+# Output info`
	`120`	`+print""`
	`121`	`+print"Chunk # {}:".format(current_chunk)`
	`122`	`+print"Filepath: {}".format(current_output)`
	`123`	`+print"# of rows: {}".format(len(chunk))`
	`124`	`+`
	`125`	`+# Create new chunk`
	`126`	`+current_chunk+=1`
`164`	`127`
`165`	`128`
`166`	`129`	`if__name__=="__main__":`
`167`		`-main(sys.argv[1:])`
	`130`	`+arguments=get_arguments()`
	`131`	`+parse_file(arguments)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit9828e28

File tree

1 file changed

1 file changed

`‎12_csv_split.py`

0 commit comments