This repository was archived by the owner on Jan 3, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathplot.py
More file actions
112 lines (95 loc) · 3.32 KB
/
plot.py
File metadata and controls
112 lines (95 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
'''
Plot some dataset analyses
'''
import argparse
from types import SimpleNamespace
from matplotlib import pyplot as plt
from data import DATASETS
# Font sizes handed to the matplotlib rc settings in main
TINY_SIZE = 12  # legend text
SMALL_SIZE = 14  # default text, axis labels and tick labels
MEDIUM_SIZE = 18  # axes title
BIGGER_SIZE = 24  # figure title
def parse_args(argv=None):
    ''' Defines and parses the plotting specific arguments

    The -D/-d/-p options are repeatable and are paired up positionally by main
    (the i-th dataset is loaded from the i-th data directory and the i-th
    preprocess directory). They are therefore all required and must be given
    the same number of times.

    argv: optional list of argument strings; when None argparse falls back to
        the process command line.

    Returns the parsed argparse.Namespace. Invalid usage is reported through
    parser.error, which prints the usage message and exits.
    '''
    parser = argparse.ArgumentParser(
        description='Generate dataset plots'
    )
    parser.add_argument(
        '-D',
        '--datasets',
        type=str,
        action='append',
        # Without required=True a missing option is left as None and only
        # fails later with an opaque TypeError when main zips the lists.
        required=True,
        choices=[d for d in DATASETS if 'parsed' in d],
        help='Names of the datasets to load'
    )
    parser.add_argument(
        '-d',
        '--data-directories',
        type=str,
        action='append',
        required=True,
        help='Location of the data'
    )
    parser.add_argument(
        '-p',
        '--preprocess-directories',
        type=str,
        action='append',
        required=True,
        help='Location for the preprocessed data'
    )
    parser.add_argument(
        '-s',
        '--spans',
        type=int,
        # '+' rather than '*': a bare "-s" would otherwise produce an empty
        # list, leaving nothing to plot.
        nargs='+',
        default=[2, 4, 6, 8, 10],
        help='Which spans to plot for the datasets'
    )
    parser.add_argument(
        'figure',
        type=str,
        help='What is the name of the figure to save'
    )

    args = parser.parse_args(args=argv)

    # The three lists are zipped together by main; a length mismatch would
    # silently drop the unmatched trailing entries, so reject it up front.
    counts = {
        len(args.datasets),
        len(args.data_directories),
        len(args.preprocess_directories),
    }
    if len(counts) != 1:
        parser.error(
            '-D/--datasets, -d/--data-directories and '
            '-p/--preprocess-directories must be given the same number of times'
        )

    return args
def _colorblind_style():
    ''' Pick the colorblind style sheet name known to the installed matplotlib '''
    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
    # later releases removed the old names, so only use the original name
    # while it is still advertised.
    legacy_name = 'seaborn-colorblind'
    if legacy_name in plt.style.available:
        return legacy_name
    return 'seaborn-v0_8-colorblind'


def _average_spans(dataset_name, data_directory, preprocess_directory, spans):
    ''' Load the validation split once per span and collect (span, average) pairs '''
    span_list = []
    for span in sorted(spans):
        # Minimal stand-in for the project config object expected by the
        # dataset constructor.
        # NOTE(review): the max_* limits are all set to 0, presumably meaning
        # "no limit" -- confirm against the dataset implementation.
        config = SimpleNamespace(
            span=span,
            data_directory=data_directory,
            preprocess_directory=preprocess_directory,
            max_span=0,
            max_examples=0,
            max_line_length=0,
            max_input_length=0,
            max_target_length=0,
            preprocess_buffer_size=12500,
        )

        dataset = DATASETS[dataset_name](config, split='valid').load()
        span_list.append((span, dataset.stats['Constituent Spans'].average))

    return span_list


def main(argv=None):
    ''' The main entry-point to generate our dataset plots

    For every (dataset, data directory, preprocess directory) triple given on
    the command line, load the dataset for each requested span and plot the
    average 'Constituent Spans' statistic against the span. The figure is
    written to '<figure>.pdf'.

    argv: optional list of argument strings forwarded to parse_args.
    '''
    args = parse_args(argv)

    dataset_spans = {}
    for dataset_name, data_directory, preprocess_directory in zip(
            args.datasets, args.data_directories, args.preprocess_directories
    ):
        dataset_spans[dataset_name] = _average_spans(
            dataset_name, data_directory, preprocess_directory, args.spans
        )

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
    plt.rc('axes', labelsize=SMALL_SIZE)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=TINY_SIZE)     # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    # Plot dataset statistics
    with plt.style.context(_colorblind_style()):
        figure = plt.figure(figsize=(6, 3))
        plt.title('Chunk Size given k')
        for dataset_name, span_list in dataset_spans.items():
            spans, average_spans = zip(*span_list)
            plt.plot(spans, average_spans, label=dataset_name)

        plt.xlabel('k')
        plt.xticks(args.spans)
        plt.ylabel('Average Chunk Size')
        plt.legend(loc='best', bbox_to_anchor=(0.5, 0.01, 0.5, 0.5), borderaxespad=0.)
        plt.savefig(f'{args.figure}.pdf', bbox_inches='tight')

        # pyplot keeps every figure alive until it is closed explicitly, so
        # release it once it has been written to disk.
        plt.close(figure)
# Only generate the plots when executed as a script, not when imported.
if __name__ == '__main__':
    main()