Skip to content

Commit 5a4a98e

Browse files
Implement persistent IP-to-hostname mapping (#15)
* Add workflows and dependabot config
* Improve hostname resolution
* Add comments
* fix: ensure only valid str type IP addresses are processed
* trim whitespaces

Co-authored-by: Copilot <[email protected]>

* fix: Track unresolved IPs during reverse DNS lookups

---------

Co-authored-by: Copilot <[email protected]>
1 parent 04eef8f commit 5a4a98e

File tree

1 file changed

+73
-114
lines changed

1 file changed

+73
-114
lines changed

parse.py

Lines changed: 73 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,10 @@
2121
import sys
2222
import os
2323
import glob
24-
import pathlib
2524
import platform
2625
import shutil
27-
28-
26+
import shelve
27+
import socket
2928

3029
if platform.system() == "Darwin":
3130
# Define the path to tshark within the Wireshark.app package
@@ -35,137 +34,97 @@
3534
else:
3635
sys.exit("This script requires *nix.")
3736

37+
# Module-level accumulator shared across the script: reverse_dns() appends an
# address here whenever its lookup fails, and main() prints the collected list
# after the CSV has been written.
unresolvable_ips = [] # List to keep track of unresolvable IP addresses
38+
3839
def main():
    """
    Command-line entry point: convert pcap capture(s) into a single CSV file.

    Reads two positional arguments from ``sys.argv``: the output CSV path and
    a path that is either one ``.pcap`` file or a directory that is searched
    (non-recursively) for ``*.pcap`` files. Every capture is parsed with
    run_tshark(), the resulting DataFrames are concatenated, sorted by
    ``frame.time_epoch`` and written to the output CSV.

    IP-to-hostname mappings are persisted between runs in a shelve database
    named ``ip_hostname_db``, opened relative to the current working
    directory.

    Returns:
        None. Problems (bad usage, no captures, tshark failures) are reported
        on stdout and end the function early.
    """
    # Validate the command line *before* opening the shelve database, so that
    # a usage error or a bad path does not create a database file on disk.
    if len(sys.argv) != 3:
        print("Usage: python parse.py <output_csv_file> <path_to_pcap_file_or_directory>")
        return

    output_csv_file, pcap_path = sys.argv[1], sys.argv[2]

    # Check if the path is a directory or a single file
    if os.path.isdir(pcap_path):
        # glob returns files in arbitrary order; sorting makes the order in
        # which mappings are written to the shelve (last one wins) repeatable.
        pcap_files = sorted(glob.glob(os.path.join(pcap_path, '*.pcap')))
    elif os.path.isfile(pcap_path) and pcap_path.endswith('.pcap'):
        pcap_files = [pcap_path]
    else:
        print(f"No valid pcap files found at the specified path: {pcap_path}")
        return

    if not pcap_files:
        print("No pcap files found.")
        return

    # Process each pcap file; the shelve is only needed while parsing, so it
    # is closed (and flushed to disk) as soon as the last capture is done.
    ip_shelve_path = 'ip_hostname_db'
    df_list = []
    with shelve.open(ip_shelve_path) as ip_shelve:
        for pcap_file in pcap_files:
            df = run_tshark(pcap_file, ip_shelve)
            if df is not None:
                df_list.append(df)

    if not df_list:
        print("Failed to parse any pcap files.")
        return

    # Combine into a single DataFrame ordered by capture timestamp
    combined_df = pd.concat(df_list).sort_values(by='frame.time_epoch')
    combined_df.to_csv(output_csv_file, index=False)
    print(f"Output file created: {output_csv_file}")

    if unresolvable_ips:
        # Report each address once, in first-seen order, even if it was
        # recorded several times during the run.
        print("Unresolvable IP addresses:", list(dict.fromkeys(unresolvable_ips)))
78+
def run_tshark(pcap_file, ip_shelve):
    """
    Run tshark on a pcap file and return the output as a Pandas DataFrame.

    tshark is invoked in two-pass mode with TCP retransmissions filtered out
    and asked to emit the selected fields as quoted, comma-separated text with
    a header row. The parsed DataFrame is handed to
    update_ip_hostname_mappings() together with ``ip_shelve`` before it is
    returned.

    Returns the DataFrame, or None (after printing tshark's stderr) when
    tshark exits with a non-zero status.
    """
    # Fields to extract, one CSV column each
    field_names = (
        'frame.time_epoch',
        'eth.src', 'eth.dst',
        'ip.src', 'ip.dst',
        'tcp.srcport', 'tcp.dstport',
        'udp.srcport', 'udp.dstport',
        '_ws.col.Protocol', 'frame.len',
        'dns.qry.name', 'dns.a',
        'tls.handshake.extensions_server_name',
    )

    # Fixed part of the tshark command line
    tshark_args = [
        TSHARK_PATH,
        '-r', pcap_file,
        '-T', 'fields',
        '-E', 'header=y',
        '-E', 'separator=,',
        '-E', 'quote=d',
        '-E', 'occurrence=a',
        '-2',
        '-R', 'not tcp.analysis.retransmission',
    ]
    for name in field_names:
        tshark_args.extend(('-e', name))

    # Run tshark to completion, capturing both output streams
    completed = subprocess.run(
        tshark_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if completed.returncode != 0:
        print(f"Error running tshark on pcap file: {pcap_file}")
        print(completed.stderr.decode())
        return None

    # Decode the output and read it into a Pandas DataFrame
    csv_text = completed.stdout.decode()
    df = pd.read_csv(StringIO(csv_text), low_memory=False)
    update_ip_hostname_mappings(df, ip_shelve)
    return df
16099

161-
# Make sure the ports are integers
162-
port_columns = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport']
163-
for column in port_columns:
164-
if column in df:
165-
df[column] = df[column].fillna(0).astype(int)
100+
def update_ip_hostname_mappings(df, ip_shelve):
    """
    Record IP-to-hostname mappings found in ``df`` and annotate ``df`` with them.

    Mappings are learned from two sources, in this order (later writes win):
      1. DNS answers: every address in ``dns.a`` maps to ``dns.qry.name``.
      2. TLS SNI: ``ip.dst`` maps to ``tls.handshake.extensions_server_name``.

    ``df`` is then modified in place: ``src_hostname`` and ``dst_hostname``
    columns are added, and the ``dns.qry.name``, ``dns.a`` and
    ``tls.handshake.extensions_server_name`` columns are dropped.

    Args:
        df: DataFrame holding the tshark field columns named above plus
            ``ip.src`` and ``ip.dst``.
        ip_shelve: Mutable mapping of IP string -> hostname (a shelve in
            production; any dict-like object works). It is updated in place.

    Returns:
        None.
    """
    # Extract all IP -> hostname mappings from DNS fields
    dns_df = df[df['dns.qry.name'].notna() & df['dns.a'].notna()]
    for query_name, answers in zip(dns_df['dns.qry.name'], dns_df['dns.a']):
        # A single response may carry several comma-separated A records.
        for ip in str(answers).split(','):
            ip = ip.strip()
            if ip:
                ip_shelve[ip] = query_name

    # Extract all IP -> hostname mappings from SNI fields. Rows without an
    # ip.dst value are skipped: shelve keys must be strings, so a NaN key
    # would raise.
    sni_column = 'tls.handshake.extensions_server_name'
    sni_df = df[df[sni_column].notna() & df['ip.dst'].notna()]
    for ip, server_name in zip(sni_df['ip.dst'], sni_df[sni_column]):
        ip_shelve[str(ip).strip()] = server_name

    # Each distinct address is resolved once and remembered for the rest of
    # this call, so a reverse DNS lookup (slow, blocking) happens at most once
    # per address and only when the shelve has no entry for it.
    resolved = {}

    def hostname_for(value):
        if pd.isna(value):
            return ''
        ip = str(value).strip()
        if not ip:
            return ''
        if ip not in resolved:
            hostname = ip_shelve.get(ip)
            if hostname is None:
                hostname = reverse_dns(ip)
            resolved[ip] = hostname
        return resolved[ip]

    # Fill in the hostnames for each IP address
    df['src_hostname'] = df['ip.src'].map(hostname_for)
    df['dst_hostname'] = df['ip.dst'].map(hostname_for)

    # Remove the SNI and DNS fields
    df.drop(['dns.qry.name', 'dns.a', 'tls.handshake.extensions_server_name'], axis=1, inplace=True)
114+
115+
def reverse_dns(ip_address):
    """
    Attempts to resolve an IP address to a hostname using a reverse DNS lookup.

    This function is used as a fallback mechanism in the event that an IP
    address does not have a corresponding hostname entry in the shelve
    database.

    Args:
        ip_address: The address to resolve, as a string.

    Returns:
        The resolved hostname, or '' when the argument is empty, not a
        string, not a valid IPv4/IPv6 literal, or cannot be resolved.
        Addresses whose lookup fails are recorded once in the module-level
        ``unresolvable_ips`` list.
    """
    # Local import keeps this block self-contained without touching the
    # file's import section.
    import ipaddress

    if not ip_address or not isinstance(ip_address, str):
        return ''

    candidate = ip_address.strip()
    try:
        ipaddress.ip_address(candidate)
    except ValueError:
        # socket.gethostbyaddr() also accepts host names, so text that is not
        # an IP literal (e.g. a stringified NaN) would otherwise trigger a
        # pointless forward DNS query. Such values are not reported as
        # unresolvable because they are not addresses at all.
        return ''

    try:
        return socket.gethostbyaddr(candidate)[0]
    except (socket.herror, socket.gaierror):
        # Record each failing address only once.
        if candidate not in unresolvable_ips:
            unresolvable_ips.append(candidate)
        return ''
169128

170129
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)