@@ -9,52 +9,27 @@ public class _609 {
 
     public static class Solution1 {
         public List<List<String>> findDuplicate(String[] paths) {
-            Map<String, List<String>> map = new HashMap<>(); // key is the file content, value is the list of directories that contain this file
+            Map<String, List<String>> contentMap = new HashMap<>(); // file content -> full paths of the files that have it
             for (String path : paths) {
-                String[] dirAndFiles = path.split(" ");
-                for (int i = 1; i < dirAndFiles.length; i++) {
-                    String content = dirAndFiles[i].substring(dirAndFiles[i].indexOf("(") + 1, dirAndFiles[i].indexOf(")"));
-                    if (!map.containsKey(content)) {
-                        map.put(content, new ArrayList<>());
+                String[] contents = path.split(" ");
+                for (int i = 1; i < contents.length; i++) {
+                    int start = contents[i].indexOf('(');
+                    int end = contents[i].indexOf(')');
+                    String content = contents[i].substring(start + 1, end);
+                    if (!contentMap.containsKey(content)) {
+                        contentMap.put(content, new ArrayList<>());
                     }
-                    List<String> dirs = map.get(content);
-                    dirs.add(dirAndFiles[0] + "/" + dirAndFiles[i].substring(0, dirAndFiles[i].indexOf("(")));
-                    map.put(content, dirs);
+                    List<String> dupFiles = contentMap.get(content);
+                    dupFiles.add(contents[0] + "/" + contents[i].substring(0, start));
                 }
             }
             }
-
             List<List<String>> result = new ArrayList<>();
-            for (String content : map.keySet()) {
-                if (map.get(content).size() > 1) {
-                    List<String> dupFile = new ArrayList<>();
-                    for (String dir : map.get(content)) {
-                        dupFile.add(dir);
-                    }
-                    result.add(dupFile);
+            for (String key : contentMap.keySet()) {
+                if (contentMap.get(key).size() > 1) {
+                    result.add(contentMap.get(key));
                 }
             }
             return result;
         }
     }
-
-    /** Answers to follow-up questions:
-     * 1. Imagine you are given a real file system, how would you search files: DFS or BFS?
-     * A: Both BFS and DFS can do the job. BFS uses extra memory for its queue, but it also takes advantage of memory locality, so it could be faster.
-     *
-     * 2. If the file content is very large (GB level), how would you modify your solution?
-     * A: First group files by size, since files of different sizes are guaranteed to differ; then
-     * hash a small part of each file with MD5, SHA-256, etc., and compare contents byte by byte only when the hashes match.
-     *
-     * 3. If you can only read the file 1 KB at a time, how would you modify your solution?
-     * A: This does not change the solution: hash each 1 KB chunk, and again compare byte by byte only when necessary.
-     *
-     * 4. What is the time complexity of the modified solution? What are its most time- and memory-consuming parts, and how would you optimize them?
-     * A: O(n^2 * k) in the worst case: we may have to compare each file with every other file, where k is the file length.
-     * Comparing the files (by size, by hash, and eventually byte by byte) is the most time-consuming part.
-     * Generating a hash for every file is the most memory-consuming part.
-     *
-     * 5. How do you make sure the duplicate files you find are not false positives?
-     * A: Size comparison, hash checks, and a final byte-by-byte check together rule out false positives.
-     */
-
 }
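
For reference, the refactored Solution1 can be exercised with a tiny driver. The demo class and the sample input below are illustrative assumptions, not part of the patch; the expected output follows from tracing the map-building loop above.

public class _609Demo {
    public static void main(String[] args) {
        _609.Solution1 solution = new _609.Solution1();
        // Each entry is "directory file1(content1) file2(content2) ..."
        String[] paths = {"root/a 1.txt(abcd) 2.txt(efgh)", "root/c 3.txt(abcd)"};
        // Only content "abcd" appears in more than one file,
        // so this prints [[root/a/1.txt, root/c/3.txt]]
        System.out.println(solution.findDuplicate(paths));
    }
}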
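
The follow-up notes removed by this patch (group candidate files by size, hash them, then verify byte by byte) can be sketched as below. This is a minimal illustration using only java.io and java.security; the class name, method name, and the 1 KB chunk size are assumptions chosen for the example.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class FileHashSketch {
    // Digest a file in 1 KB chunks so a GB-sized file is never held in memory at once.
    static String md5Of(String filePath) throws IOException, NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("MD5");
        try (InputStream in = new FileInputStream(filePath)) {
            byte[] buffer = new byte[1024]; // read 1 KB at a time, per follow-up 3
            int read;
            while ((read = in.read(buffer)) != -1) {
                md.update(buffer, 0, read); // feed each chunk into the running digest
            }
        }
        StringBuilder hex = new StringBuilder();
        for (byte b : md.digest()) {
            hex.append(String.format("%02x", b)); // render the digest as lowercase hex
        }
        return hex.toString();
    }
}

Files whose sizes differ never need to be hashed at all, and files whose digests collide would still be confirmed byte by byte, which is what rules out false positives (follow-up 5).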