|
21 | 21 | import sys |
22 | 22 | import os |
23 | 23 | import glob |
24 | | -import pathlib |
25 | 24 | import platform |
26 | 25 | import shutil |
27 | | - |
28 | | - |
| 26 | +import shelve |
| 27 | +import socket |
29 | 28 |
|
30 | 29 | if platform.system() == "Darwin": |
31 | 30 | # Define the path to tshark within the Wireshark.app package |
|
35 | 34 | else: |
36 | 35 | sys.exit("This script requires *nix.") |
37 | 36 |
|
| 37 | +unresolvable_ips = [] # List to keep track of unresolvable IP addresses |
| 38 | + |
def main():
    """
    Parse every pcap named on the command line and write one combined CSV.

    Usage: python parse.py <output_csv_file> <path_to_pcap_file_or_directory>

    The second argument is either a single ``.pcap`` file or a directory
    whose ``*.pcap`` files are all parsed. IP -> hostname mappings are kept
    in the ``ip_hostname_db`` shelve database so they persist between runs.
    """
    # Validate the arguments before touching the shelve database, so a usage
    # error does not create an empty database file on disk.
    if len(sys.argv) != 3:
        print("Usage: python parse.py <output_csv_file> <path_to_pcap_file_or_directory>")
        return

    output_csv_file = sys.argv[1]
    pcap_path = sys.argv[2]

    # Check if the path is a directory or a single file
    if os.path.isdir(pcap_path):
        # glob returns files in arbitrary order; sort so the hostname cache
        # is filled in a reproducible order (later files overwrite earlier
        # mappings for the same IP).
        pcap_files = sorted(glob.glob(os.path.join(pcap_path, '*.pcap')))
    elif os.path.isfile(pcap_path) and pcap_path.endswith('.pcap'):
        pcap_files = [pcap_path]
    else:
        print(f"No valid pcap files found at the specified path: {pcap_path}")
        return

    if not pcap_files:
        print("No pcap files found.")
        return

    # Process each pcap file; the shelve is only held open while it is used.
    df_list = []
    ip_shelve_path = 'ip_hostname_db'
    with shelve.open(ip_shelve_path) as ip_shelve:
        for pcap_file in pcap_files:
            df = run_tshark(pcap_file, ip_shelve)
            if df is not None:
                df_list.append(df)

    if not df_list:
        print("Failed to parse any pcap files.")
        return

    # Combine into a single DataFrame ordered by capture time
    combined_df = pd.concat(df_list).sort_values(by='frame.time_epoch')
    combined_df.to_csv(output_csv_file, index=False)
    print(f"Output file created: {output_csv_file}")

    if unresolvable_ips:
        # Report each address once, in first-seen order.
        print("Unresolvable IP addresses:", list(dict.fromkeys(unresolvable_ips)))
| 77 | + |
def run_tshark(pcap_file, ip_shelve):
    """
    Run tshark on a pcap file and return the output as a Pandas DataFrame.

    The DataFrame is passed through update_ip_hostname_mappings() together
    with ``ip_shelve`` before it is returned. Returns None (after printing
    tshark's stderr) when tshark exits with a non-zero status.
    """
    # Define the fields to extract
    fields = [
        'frame.time_epoch',
        'eth.src', 'eth.dst',
        'ip.src', 'ip.dst',
        'tcp.srcport', 'tcp.dstport',
        'udp.srcport', 'udp.dstport',
        '_ws.col.Protocol', 'frame.len',
        'dns.qry.name', 'dns.a',
        'tls.handshake.extensions_server_name',
    ]

    # Create the command to run tshark
    command = [
        TSHARK_PATH,
        '-r', pcap_file,
        '-T', 'fields',
        '-E', 'header=y',
        '-E', 'separator=,',
        '-E', 'quote=d',
        '-E', 'occurrence=a',
        '-2',
        '-R', 'not tcp.analysis.retransmission',
    ]
    for field in fields:
        command += ['-e', field]

    # Run the tshark command and capture the output
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        print(f"Error running tshark on pcap file: {pcap_file}")
        print(result.stderr.decode(errors='replace'))
        return None

    # Decode leniently: a single undecodable byte in the tshark output should
    # not abort the whole run with UnicodeDecodeError.
    data = StringIO(result.stdout.decode(errors='replace'))
    df = pd.read_csv(data, low_memory=False)

    # Make sure the ports are integers (missing ports become 0); without this
    # they are written to the CSV as floats whenever a column contains NaN.
    port_columns = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport']
    for column in port_columns:
        if column in df:
            df[column] = df[column].fillna(0).astype(int)

    update_ip_hostname_mappings(df, ip_shelve)
    return df
160 | 99 |
|
161 | | - # Make sure the ports are integers |
162 | | - port_columns = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport'] |
163 | | - for column in port_columns: |
164 | | - if column in df: |
165 | | - df[column] = df[column].fillna(0).astype(int) |
def update_ip_hostname_mappings(df, ip_shelve):
    """
    Record the IP -> hostname pairs seen in ``df`` and annotate ``df`` with them.

    ``ip_shelve`` is a mapping (a shelve database in this script) keyed by IP
    address string. It is updated from DNS answers first and TLS SNI second,
    so an SNI hostname overrides a DNS name for the same IP within one call.

    ``df`` is modified in place: ``src_hostname`` and ``dst_hostname`` columns
    are added, and the ``dns.qry.name``, ``dns.a`` and
    ``tls.handshake.extensions_server_name`` columns are dropped.
    """
    sni_column = 'tls.handshake.extensions_server_name'

    # DNS answers: every address in the comma-separated answer list maps to
    # the queried name.
    dns_df = df[df['dns.qry.name'].notna() & df['dns.a'].notna()]
    for query_name, answers in zip(dns_df['dns.qry.name'], dns_df['dns.a']):
        for ip in (part.strip() for part in str(answers).split(',')):
            if ip:
                ip_shelve[ip] = query_name

    # TLS SNI: the destination of the handshake maps to the server name.
    # Rows without ip.dst are skipped, because a missing value is a float NaN
    # and shelve only accepts string keys.
    sni_df = df[df[sni_column].notna() & df['ip.dst'].notna()]
    for dst_ip, server_name in zip(sni_df['ip.dst'], sni_df[sni_column]):
        ip_shelve[str(dst_ip)] = server_name

    resolved = {}  # per-call memo so each distinct IP is resolved only once

    def lookup(ip):
        # Missing addresses (NaN/None/empty) have no hostname.
        if pd.isna(ip) or not ip:
            return ''
        key = str(ip)
        if key not in resolved:
            hostname = ip_shelve.get(key)
            # Fall back to reverse DNS only on a cache miss; passing it as the
            # .get() default would run a blocking lookup for every row.
            if hostname is None:
                hostname = reverse_dns(key)
            resolved[key] = hostname
        return resolved[key]

    df['src_hostname'] = df['ip.src'].map(lookup)
    df['dst_hostname'] = df['ip.dst'].map(lookup)
    df.drop(['dns.qry.name', 'dns.a', sni_column], axis=1, inplace=True)
| 114 | + |
def reverse_dns(ip_address):
    """
    Attempt to resolve an IP address to a hostname using a reverse DNS lookup.

    This is the fallback used when an IP address has no hostname entry in the
    shelve database. Returns the hostname, or '' when ``ip_address`` is empty,
    is not a string, or cannot be resolved. Addresses that fail to resolve
    are recorded once in the module-level ``unresolvable_ips`` list.
    """
    if not ip_address or not isinstance(ip_address, str):
        return ''
    # An address that already failed is not queried again: this avoids
    # repeating a blocking lookup and keeps unresolvable_ips free of
    # duplicates.
    if ip_address in unresolvable_ips:
        return ''
    try:
        return socket.gethostbyaddr(ip_address)[0]
    except (socket.herror, socket.gaierror):
        unresolvable_ips.append(ip_address)
        return ''
169 | 128 |
|
170 | 129 | if __name__ == "__main__": |
171 | 130 | main() |
0 commit comments