
Commit 2e48ff9

Lots of changes. Fixed Coveralls, nosetests. Changed k-means clustering. Updated README.md.
1 parent e9dbc95 commit 2e48ff9

6 files changed: +26 −29 lines


.coveragerc

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,5 +1,6 @@
 [report]
 omit =
     */python?.?/*
+    *py?.?*
     */site-packages/nose/*
     *__init__*
```

.gitignore

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@
 **/*.pyc
 dots_backend/db
 dots_backend/scratch.py
-sample_data_two
 build
 dist
 dots_for_microarrays.egg-info
+output
```

README.md

Lines changed: 6 additions & 10 deletions
````diff
@@ -231,8 +231,6 @@ You can read them in as part of your `read_experiment` call as follows:
 experiment = read_experiment(array_filenames, baseline=True, annotations_file='annotations.txt')
 ```
 
-Note that the `annotations_file` is an
-
 ## The dots_analysis module
 
 This is the meat of the dots_backend.
@@ -271,14 +269,16 @@ clustering.
 The `find_clusters` function returns a list of cluster numbers in the same order as the rows in the
 experiment data frame. If the method is hierarchical - `how=`hierarchical` - then the number of
 clusters is set at the square root of (number of rows divided by two), a good approximation. If the
-method is k-means - `how='kmeans'` - then values of k (the number of clusters) from 3 to 10 are tested
-using silhouette analysis and the best value picked. An additional argument passed to the function -
-`k_range=(3,51)` allows you to increase the number of values tested to, in this example, 50. Here's
+method is k-means - `how='kmeans'` - then values of k (the number of clusters) for a range of square
+numbers are tested (4, 9, 16, 25)
+using [silhouette analysis](https://en.wikipedia.org/wiki/Silhouette_(clustering)) and the best value
+picked. An additional argument passed to the function -
+`k_vals=range(3,51)` allows you to increase the number of values tested to, in this example, 50. Here's
 how to get a list of clusters with either hierarchical or k-means clustering:
 
 ```python
 hier_clusters = find_clusters(experiment_med.df, how='hierarchical')
-km_clusters = find_clusters(experiment_med.df, k_range=(3,11), how='kmeans')
+km_clusters = find_clusters(experiment_med.df, k_vals=range(3,11), how='kmeans')
 ```
 
 The `get_clusters` function includes the functionality of the `find_clusters`function, but first
@@ -349,7 +349,3 @@ to plot on the volcano plot, for example:
 ```python
 do_volcanoplot(experiment, ('treated', 'untreated'), show=False, image=False, html_file='volcano_plot.html')
 ```
-
-**Note that the function, `render_plot_to_png`, that generates the PNG versions of the plots requires the
-[PhantomJS](http://phantomjs.org) JavaScript API to be installed (it's essentially a headless browser) to
-work properly.**
````
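For reference, the silhouette-based choice of k that the updated README text describes boils down to something like the following. This is a minimal sketch rather than the package's exact code, assuming scikit-learn's `KMeans` and `silhouette_score` (the same functions `find_clusters` uses in `dots_analysis.py`):

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_best_k(df, k_vals=[4, 9, 16, 25]):
    '''Return the k from k_vals with the highest mean silhouette score.'''
    best_k, best_score = None, -1.0
    for k in k_vals:
        # Cluster the rows, then score how well separated the clusters are.
        labels = KMeans(n_clusters=k, random_state=10).fit_predict(df)
        score = silhouette_score(df, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k
```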

dots_backend/dots_analysis.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -156,13 +156,13 @@ def run_stats(experiment):
 
     return new_df
 
-def find_clusters(df, k_range=(3,11), how='hierarchical'):
+def find_clusters(df, k_vals=[4, 9, 16, 25], how='hierarchical'):
     '''Find clusters, and if method is k-means run silhouette analysis
     to determine the value of k.
 
     Args:
         df (data frame): A data frame with normalised expression data.
-        k_range (tuple): The range over which to test k.
+        k_vals (list or range): The range over which to test k.
         how ('hierarchical' or 'kmeans'): Clustering method.
 
     Returns:
@@ -184,7 +184,7 @@ def find_clusters(df, k_range=(3,11), how='hierarchical'):
 
     ## Try values of k from range and keep track of optimal k according
     ## to silhouette score.
-    for k in range(*k_range):
+    for k in k_vals:
        km = KMeans(n_clusters=k, random_state=10)
        clusters = km.fit_predict(df)
        silhouette_avg = silhouette_score(df, clusters)
@@ -274,12 +274,12 @@ def get_clusters(experiment, how='hierarchical'):
 
     ## K-means clustering with silhouette analysis to determine value of k.
     elif how == 'kmeans':
-        clusters = find_clusters(filtered_df[norm_exp_cols], k_range=(3, k_limit), how='kmeans')
+        clusters = find_clusters(filtered_df[norm_exp_cols], k_vals=range(3, k_limit), how='kmeans')
        filtered_df['cluster'] = clusters
 
     ## Sort the data frame by cluster and mean expression across samples.
     filtered_df['mean_norm_expression'] = filtered_df[norm_exp_cols].mean(axis=0)
-    filtered_df.sort(columns=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)
+    filtered_df.sort_values(by=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)
     filtered_df = filtered_df.reset_index(drop=True)
 
     return filtered_df
@@ -320,7 +320,7 @@ def keyfunc(col):
 
     ## Fix the type of the FeatureNum column and sort it.
     merged_df['FeatureNum'] = merged_df['FeatureNum'].astype(int)
-    merged_df.sort(columns='FeatureNum', ascending=True, inplace=True)
+    merged_df.sort_values(by='FeatureNum', ascending=True, inplace=True)
 
     ## Write the table.
     merged_df.to_csv(outfile, sep='\t', index=False)
```
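The `sort` → `sort_values` changes track the pandas API: `DataFrame.sort` was deprecated in pandas 0.17 and removed in 0.20, with `sort_values(by=...)` as the replacement. A minimal illustration on hypothetical data (not taken from the repo):

```python
import pandas as pd

df = pd.DataFrame({'cluster': [2, 1, 1], 'mean_norm_expression': [0.5, 2.0, 1.0]})

# Old API, removed in pandas 0.20:
# df.sort(columns=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)

# Replacement used in this commit:
df.sort_values(by=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)
```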

dots_tests/test_dots_analysis.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -41,12 +41,12 @@ def test_find_clusters():
     rows = np.random.choice(df.index.values, num_samples)
     sampled_df = df.ix[rows]
     hier = find_clusters(sampled_df, how='hierarchical')
-    kmeans = find_clusters(sampled_df, k_range=(3,6), how='kmeans')
+    kmeans = find_clusters(sampled_df, k_vals=range(3,6), how='kmeans')
     assert_equals(len(hier), num_samples)
     assert_equals(len(kmeans), num_samples)
 
 def test_get_clusters():
     cluster_df = get_clusters(experiment)
     assert(isinstance(cluster_df, pd.DataFrame))
     num_cols = 9 + (2 * len(list(combinations(experiment.groups, 2)))) + len(experiment.get_sampleids())
-    assert_equals(len(cluster_df.columns.values), num_cols)
+    assert_equals(len(cluster_df.columns.values), num_cols)
```

dots_tests/test_dots_arrays.py

Lines changed: 10 additions & 10 deletions
```diff
@@ -35,7 +35,7 @@
     elif h == 'ControlType':
         control_type_ind = i
 
-## Split the lines up and use the indices to pull out the values and add them
+## Split the lines up and use the indices to pull out the values and add them
 ## to the values dictionary.
 for l in lines[10:]:
     l = l.split('\t')
@@ -53,30 +53,30 @@
 array = read_array(array_fn, group, replicate)
 
 def test_probenames():
-    assert_equals(sorted(array.get_probenames()),
+    assert_equals(sorted(array.get_probenames()),
                   sorted([feat['ProbeName'] for feat in values.values()]))
 
 def test_genenames():
-    assert_equals(sorted(array.get_genenames()),
+    assert_equals(sorted(array.get_genenames()),
                   sorted([feat['GeneName'] for feat in values.values()]))
 
 def test_systematicnames():
-    assert_equals(sorted(array.get_systematicnames()),
+    assert_equals(sorted(array.get_systematicnames()),
                   sorted([feat['SystematicName'] for feat in values.values()]))
 
 def test_descriptions():
-    assert_equals(sorted(array.get_descriptions()),
+    assert_equals(sorted(array.get_descriptions()),
                   sorted([feat['Description'] for feat in values.values()]))
 
 def test_intensities():
-    assert_equals(sorted(array.get_intensities()),
+    assert_equals(sorted(array.get_intensities()),
                   sorted([float(feat['gProcessedSignal']) for feat in values.values()]))
 
 def test_normalisation():
     intensities = [float(feat['gProcessedSignal']) if float(feat['gProcessedSignal']) > 1.0 else 1.0 for feat in values.values()]
     logged = np.log2(intensities)
     logged_shifted = logged - np.percentile(logged, 75)
-    assert_equals(sorted(array.get_normalised_intensities()),
+    assert_equals(sorted(array.get_normalised_intensities()),
                   sorted(logged_shifted))
 
 def test_read_array():
@@ -104,8 +104,8 @@ def test_read_experiment():
 
 def test_arrays_attribute():
     array_exp = [array for array in experiment_2.arrays if array.sampleid == 'treated_1'][0]
-    exp_df = array_exp.df.sort(axis=1)
-    array_df = array.df.sort(axis=1)
+    exp_df = array_exp.df.sort_values(by='FeatureNum', inplace=True)
+    array_df = array.df.sort_values(by='FeatureNum', inplace=True)
     npt.assert_array_equal(exp_df, array_df)
     assert_equals(sorted(experiment.get_sampleids()), sorted(sampleids))
 
@@ -125,4 +125,4 @@ def test_get_sample_ids():
     assert_equals(sorted(experiment.get_sampleids()), sorted(sampleids))
 
 def test_remove_sample():
-    assert_equals(len(experiment_2.remove_sample(array.sampleid).get_sampleids()), len(sampleids) - 1)
+    assert_equals(len(experiment_2.remove_sample(array.sampleid).get_sampleids()), len(sampleids) - 1)
```
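One pandas behaviour worth keeping in mind when reading `test_arrays_attribute` above: `sort_values(..., inplace=True)` sorts the frame in place and returns `None`, so assigning its result does not give a sorted copy. A small standalone illustration on hypothetical data (not the repo's test):

```python
import pandas as pd

df = pd.DataFrame({'FeatureNum': [3, 1, 2]})

# inplace=True sorts df in place and returns None ...
assert df.sort_values(by='FeatureNum', inplace=True) is None

# ... so keep the returned copy when the sorted frame is needed as a value.
sorted_df = df.sort_values(by='FeatureNum').reset_index(drop=True)
```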
