1+ import os
2+ import argparse
3+ import tarfile
4+ import json
5+ import gzip
6+
7+ from glob import glob
8+ from tqdm import tqdm
9+
10+ import multiprocessing as mp
11+
12+ import code_diff as cd
13+ from code_diff .diff_utils import parse_hunks , clean_hunk
14+
def load_slcs_from_tar(tar_files):
    """Stream JSON objects out of gzip-compressed tar archives.

    Every regular file inside each archive is read line by line and each
    non-blank line is decoded as UTF-8 and parsed as one JSON document.

    Args:
        tar_files: Iterable of paths to ``.tar.gz`` archives.

    Yields:
        The parsed JSON value of every non-blank line, in archive order.

    Raises:
        tarfile.TarError: If an archive cannot be read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for tar_file in tar_files:
        # The context manager guarantees the archive is closed even when the
        # consumer stops iterating early or a line fails to parse.
        with tarfile.open(tar_file, "r:gz") as tar:
            for tarinfo in tar:
                # Directories, links, etc. carry no line data.
                if not tarinfo.isfile():
                    continue
                with tar.extractfile(tarinfo) as lines:
                    for line in lines:
                        # Tolerate blank lines (e.g. a trailing newline at the
                        # end of a JSONL file) instead of crashing on them.
                        if not line.strip():
                            continue
                        yield json.loads(line.decode("utf-8"))
25+
26+
def create_output(slc, **kwargs):
    """Build the output record for a single-line change.

    Copies a fixed set of fields from ``slc`` into a new dict and then
    overlays any extra keyword arguments on top (extras win on key clashes).

    Args:
        slc: Mapping holding at least the copied fields listed below.
        **kwargs: Additional entries to merge into the record.

    Returns:
        A new dict; ``slc`` itself is not modified.

    Raises:
        KeyError: If ``slc`` lacks one of the copied fields.
    """
    copied_fields = (
        # Where the change comes from.
        "project",
        "commit_sha",
        "parent_sha",
        "file_path",
        "project_url",
        # Flags carried over unchanged from the input record.
        "likely_bug",
        "comodified",
        "in_function",
        # The raw diff text.
        "diff",
    )

    record = {field: slc[field] for field in copied_fields}
    record.update(kwargs)
    return record
45+
46+
def process_slc(slc):
    """Turn one single-line-change record into an enriched output record.

    The record's ``"diff"`` text is split into hunks; each hunk is cleaned
    and diffed with ``code_diff`` (language fixed to Python). Hunks for
    which ``cd.difference`` raises ``ValueError`` are dropped.

    Args:
        slc: Mapping with a ``"diff"`` entry plus the fields required by
            ``create_output``.

    Returns:
        ``None`` unless exactly one hunk produced a diff. Otherwise the
        result of ``create_output`` extended with ``before``, ``after``,
        ``sstub_pattern`` and ``edit_script``.
    """
    candidates = []
    for raw_hunk in parse_hunks(slc["diff"]):
        cleaned = clean_hunk(raw_hunk)
        try:
            candidate = cd.difference(cleaned.before, cleaned.after, lang="python")
        except ValueError:
            # This hunk could not be diffed; skip it.
            continue
        candidates.append(candidate)

    # Only records with exactly one usable hunk are kept.
    if len(candidates) != 1:
        return None

    (source_diff,) = candidates

    pattern_name = source_diff.sstub_pattern().name
    script_text = str(source_diff.edit_script())

    try:
        # Prefer the statement-level diff texts when they are available.
        narrowed = source_diff.statement_diff()
        before, after = narrowed.source_text, narrowed.target_text
    except ValueError:
        # Fall back to the texts of the original diff.
        before, after = source_diff.source_text, source_diff.target_text

    return create_output(
        slc,
        before=before,
        after=after,
        sstub_pattern=pattern_name,
        edit_script=script_text,
    )
82+
83+
# Save to sharded .jsonl files (plain JSON Lines; the output is not gzip-compressed)
85+
class JsonlGzSaver:
    """Write JSON-serialisable objects to a sequence of JSON Lines shards.

    Objects are appended, one JSON document per line, to files named
    ``file-0.jsonl``, ``file-1.jsonl``, ... inside ``save_dir``. A new shard
    is started once the current one holds ``num_objects`` objects.

    Despite the class name, the shards are written as plain (uncompressed)
    UTF-8 bytes.

    The saver can be used as a context manager, which closes the current
    shard on exit.
    """

    def __init__(self, save_dir, num_objects=1e5):
        """Prepare the saver.

        Args:
            save_dir: Directory receiving the shard files; created if missing.
            num_objects: Maximum number of objects per shard.
        """
        self.save_dir = save_dir
        self.num_objects = num_objects

        self.object_count = 0  # objects written to the current shard
        self.file_count = 0    # index of the next shard to be opened

        self.file_handler = None

        # Fail early (and helpfully) instead of crashing on the first open().
        os.makedirs(self.save_dir, exist_ok=True)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def _update_handler(self):
        """Open the next shard if none is open or the current one is full."""
        need_update = self.file_handler is None or self.object_count >= self.num_objects
        if not need_update:
            return

        if self.file_handler is not None:
            self.file_handler.close()
            self.file_handler = None

        file_path = os.path.join(self.save_dir, "file-%d.jsonl" % self.file_count)

        self.file_handler = open(file_path, "wb")
        self.file_count += 1
        self.object_count = 0

    def save(self, obj):
        """Append ``obj`` as one JSON line to the current shard.

        Raises:
            TypeError: If ``obj`` is not JSON-serialisable.
        """
        # Serialise first so an unserialisable object never creates a shard.
        payload = (json.dumps(obj) + "\n").encode("utf-8")

        # Rotate *before* writing: a shard is only ever opened when there is
        # something to put into it, so no empty trailing file is left behind
        # and saving after close() transparently starts a new shard.
        self._update_handler()
        self.file_handler.write(payload)
        self.object_count += 1

    def close(self):
        """Close the current shard, if any. Safe to call repeatedly."""
        if self.file_handler is not None:
            self.file_handler.close()
            self.file_handler = None
121+
122+ # Multiprocessing --------------------------------
123+
def pmap(map_fn, data, processes=None):
    """Lazily apply ``map_fn`` to every item of ``data``, in parallel if worthwhile.

    With four or fewer processes the items are mapped sequentially in the
    calling process and results are yielded in input order. With more, a
    ``multiprocessing.Pool`` is used and results are yielded as they
    complete, i.e. in arbitrary order.

    Args:
        map_fn: Function applied to each item. Must be picklable when a
            pool is used.
        data: Iterable of input items.
        processes: Number of worker processes; defaults to
            ``multiprocessing.cpu_count()``.

    Yields:
        ``map_fn(item)`` for every item of ``data``.
    """
    cpu_count = mp.cpu_count() if processes is None else processes

    if cpu_count <= 4:  # Too few CPUs for multiprocessing
        yield from map(map_fn, data)
        # Stop here: falling through would start a pool as well and map a
        # re-iterable ``data`` a second time.
        return

    with mp.Pool(processes=cpu_count) as pool:
        yield from pool.imap_unordered(map_fn, data)
135+
136+
def main():
    """Command-line entry point.

    Reads every ``*.tar.gz`` archive in ``input_dir``, runs ``process_slc``
    on each record found inside, and writes every non-``None`` result to
    ``output_dir`` via ``JsonlGzSaver``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")

    args = parser.parse_args()

    # glob() returns files in filesystem-dependent order; sort so that the
    # archives are read in the same order on every run.
    tar_files = sorted(glob(os.path.join(args.input_dir, "*.tar.gz")))

    # Create the output directory up front so the saver's first open() does
    # not fail on a missing directory.
    os.makedirs(args.output_dir, exist_ok=True)

    slc_saver = JsonlGzSaver(args.output_dir)

    try:
        process_map = pmap(process_slc, load_slcs_from_tar(tar_files))
        # NOTE(review): 66e6 is a hard-coded progress-bar total — presumably
        # the expected number of records in the dataset; confirm.
        for output in tqdm(process_map, total=66e6):
            if output is None:
                continue
            slc_saver.save(output)
    finally:
        # Always release the open shard, even if processing fails midway.
        slc_saver.close()
155+
156+
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
0 commit comments