Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit9828e28

Browse files
committed
added csv_split
1 parent8b65a7c commit9828e28

File tree

1 file changed

+114
-150
lines changed

1 file changed

+114
-150
lines changed

‎12_csv_split.py

Lines changed: 114 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -1,167 +1,131 @@
1-
### WIP
2-
31
importsys
42
importos
5-
importgetopt
63
importcsv
4+
importargparse
75

86
"""
9-
Splits a CSV file into multiple pieces based on command line arguments.
7+
8+
Splits a CSV file into multiple pieces based on command line arguments.
109
1110
Arguments:
12-
`-h`: help file of usage of the script
13-
`-i`:input filename
14-
`-o`:output file, A %s-style template for the numbered output files.
15-
`-r`:row limit to split
16-
`-c`:A %s-style template for the numbered output files.
11+
12+
`-h`:help fileof usage of the script
13+
`-i`:input file name
14+
`-o`:output file name
15+
`-r`:row limit to split
1716
1817
Default settings:
19-
`output_path` is the current directory
20-
`keep_headers` is on (headers will be kept)
21-
`delimeter` is ,
18+
19+
`output_path` is the current directory
20+
headers are displayed on each split file
21+
the default delimeter is a comma
2222
2323
Example usage:
24-
# split by every 10000 rows
25-
>> python 12_csv_split.py -i input.csv -o rownumber -r 10000
26-
# split by unique items in column 0
27-
>> python 12_csv_split.py -i input.csv -o userid -c 0
28-
# access help
29-
>> python 12_csv_split.py -h for help
30-
24+
25+
```
26+
# split csv by every 100 rows
27+
>> python csv_split.py -i input.csv -o output -r 100
28+
```
29+
3130
"""
3231

33-
defmain(argv):
34-
35-
argument_dict=grab_command_line_arguments(argv)
36-
parse_file(argument_dict)
37-
38-
39-
defgrab_command_line_arguments(argv):
40-
41-
# global variables
42-
inputfile=''
43-
outputfile=''
44-
rowlimit=''
45-
columnindex=''
46-
argument_dict= {}
47-
48-
# grab arguments
49-
opts,args=getopt.getopt(argv,"hi:o:r:c:",["ifile=","ofile=","rowlimit=","columnindex="])
50-
51-
# end if no arguments provided
52-
ifnotopts:
53-
print"No options provided. Try again. Use `-h` for help."
54-
sys.exit()
55-
56-
# grab arguments
57-
foropt,arginopts:
58-
ifopt=='-h':
59-
print'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'
60-
sys.exit()
61-
elifoptin ("-i","--ifile"):
62-
inputfile=arg
63-
elifoptin ("-o","--ofile"):
64-
outputfile=arg
65-
elifoptin ("-r","--rowlimit"):
66-
rowlimit=arg
67-
elifoptin ("-c","--columnindex"):
68-
columnindex=arg
69-
70-
# Output arguments
71-
print"\nArguments:"
72-
ifinputfile:
73-
argument_dict["input_file"]=inputfile
74-
print"Input file is '{}'".format(inputfile)
75-
else:
76-
"Please enter an input file."
77-
ifoutputfile:
78-
argument_dict["output_file"]=outputfile
79-
print"Output file is '{}'".format(outputfile)
80-
else:
81-
print"Please enter an output file."
82-
ifrowlimit:
83-
argument_dict["rowlimit"]=rowlimit
84-
print"Rowlimit is '{}'".format(rowlimit)
85-
ifcolumnindex:
86-
argument_dict["columnindex"]=columnindex
87-
print"Columnindex is '{}'".format(columnindex)
88-
ifrowlimitandcolumnindex:
89-
print"Please use either a rowlimit or columnlimit, not both."
90-
sys.exit()
91-
ifnotrowlimitorcolumnindex:
92-
print"Please enter either a rowlimit or columnlimit."
93-
sys.exit()
94-
95-
# to do - check to make sure file, rowlimit, and columnlimit exist
96-
printargument_dict
97-
returnargument_dict
98-
99-
100-
defparse_file(argument_dict):
101-
102-
#split csv file by certain rownumber
103-
ifargument_dict["rowlimit"]:
104-
rowlimit=int(argument_dict["rowlimit"])
105-
output_name_file="{}.csv".format(argument_dict["output_file"])
106-
output_path='.'
107-
keep_headers=True
108-
delimiter=','
109-
filehandler=open(argument_dict["input_file"],'r')
110-
reader=csv.reader(filehandler,delimiter=delimiter)
111-
current_piece=1
112-
current_out_path=os.path.join(
113-
output_path,
114-
output_name_file
32+
33+
defget_arguments():
34+
"""Grab user supplied arguments using the argparse library."""
35+
36+
# Use arparse to get command line arguments
37+
parser=argparse.ArgumentParser()
38+
parser.add_argument("-i","--input_file",required=True,
39+
help="csv input file (with extension)",type=str)
40+
parser.add_argument("-o","--output_file",required=True,
41+
help="csv output file (without extension)",type=str)
42+
parser.add_argument("-r","--row_limit",required=True,
43+
help="row limit to split csv at",type=int)
44+
args=parser.parse_args()
45+
46+
# Check if the input_file exits
47+
is_valid_file(parser,args.input_file)
48+
49+
# Check if the input_file is valid
50+
is_valid_csv(parser,args.input_file,args.row_limit)
51+
52+
returnargs.input_file,args.output_file,args.row_limit
53+
54+
55+
defis_valid_file(parser,file_name):
56+
"""Ensure that the input_file exists."""
57+
ifnotos.path.exists(file_name):
58+
parser.error("The file '{}' does not exist!".format(file_name))
59+
sys.exit(1)
60+
61+
62+
defis_valid_csv(parser,file_name,row_limit):
63+
"""
64+
Ensure that the # of rows in the input_file
65+
is greater than the row_limit.
66+
"""
67+
row_count=0
68+
forrowincsv.reader(open(file_name)):
69+
row_count+=1
70+
# Note: You could also use a generator expression
71+
# and the sum() function to count the rows:
72+
# row_count = sum(1 for row in csv.reader(open(file_name)))
73+
ifrow_limit>row_count:
74+
parser.error(
75+
"The 'row_count' of '{}' is > the number of rows in '{}'!"
76+
.format(row_limit,file_name)
11577
)
116-
current_out_writer=csv.writer(open(current_out_path,'w'),delimiter=delimiter)
117-
current_limit=rowlimit
118-
ifkeep_headers:
119-
headers=reader.next()
120-
current_out_writer.writerow(headers)
121-
fori,rowinenumerate(reader):
122-
ifi+1>current_limit:
123-
current_piece+=1
124-
current_limit=rowlimit*current_piece
125-
current_out_path=os.path.join(
126-
output_path,
127-
output_name_file
128-
)
129-
current_out_writer=csv.writer(open(current_out_path,'w'),delimiter=delimiter)
130-
131-
# elif columnindex: #split csv file accrording to unique values of certain column,it's like filter only certain item in excel
132-
# itemlist = []
133-
# columnindex = int(columnindex)
134-
# output_name_template= outputfile+'_%s.csv'
135-
# output_path='.'
136-
# keep_headers=True
137-
# delimiter=','
138-
# filehandler = open(inputfile,'r')
139-
# reader = csv.reader(filehandler, delimiter=delimiter)
140-
# if keep_headers:
141-
# headers = reader.next()
142-
143-
# for i, row in enumerate(reader):
144-
145-
# current_out_path = os.path.join(
146-
# output_path,
147-
# output_name_template % row[columnindex] )
148-
# if row[columnindex] not in itemlist:
149-
# try:
150-
# current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
151-
# except IOError:
152-
# continue
153-
# else:
154-
# itemlist.append(row[columnindex])
155-
# if keep_headers:
156-
# current_out_writer.writerow(headers)
157-
# current_out_writer.writerow(row)
158-
# else:
159-
# current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
160-
# current_out_writer.writerow(row)
161-
# print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
162-
# else:
163-
# print "oops, please check instruction of script by >>./csvsplit.py -h"
78+
sys.exit(1)
79+
80+
81+
defparse_file(arguments):
82+
"""
83+
Splits the CSV into multiple files or chunks based on the row_limit.
84+
Then create new CSV files.
85+
"""
86+
input_file=arguments[0]
87+
output_file=arguments[1]
88+
row_limit=arguments[2]
89+
output_path='.'# Current directory
90+
91+
# Read CSV, split into list of lists
92+
withopen(input_file,'r')asinput_csv:
93+
datareader=csv.reader(input_csv)
94+
all_rows= []
95+
forrowindatareader:
96+
all_rows.append(row)
97+
98+
# Remove header
99+
header=all_rows.pop(0)
100+
101+
# Split list of list into chunks
102+
current_chunk=0
103+
foriinrange(0,len(all_rows),row_limit):# Loop through list
104+
chunk=all_rows[i:i+row_limit]# Create single chunk
105+
106+
current_output=os.path.join(# Create new output file
107+
output_path,
108+
"{}-{}.csv".format(output_file,current_chunk)
109+
)
110+
111+
# Add header
112+
chunk.insert(0,header)
113+
114+
# Write chunk to output file
115+
withopen(current_output,'w')asoutput_csv:
116+
writer=csv.writer(output_csv)
117+
writer=writer.writerows(chunk)
118+
119+
# Output info
120+
print""
121+
print"Chunk # {}:".format(current_chunk)
122+
print"Filepath: {}".format(current_output)
123+
print"# of rows: {}".format(len(chunk))
124+
125+
# Create new chunk
126+
current_chunk+=1
164127

165128

166129
if__name__=="__main__":
167-
main(sys.argv[1:])
130+
arguments=get_arguments()
131+
parse_file(arguments)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp