
Commit 2e48ff9

Lots of changes. Fixed Coveralls, nosetests. Changed k-means clustering. Updated README.md.
1 parent e9dbc95 commit 2e48ff9

6 files changed: +26 −29 lines


.coveragerc

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,5 +1,6 @@
 [report]
 omit =
     */python?.?/*
+    *py?.?*
     */site-packages/nose/*
     *__init__*
```

.gitignore

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@
 **/*.pyc
 dots_backend/db
 dots_backend/scratch.py
-sample_data_two
 build
 dist
 dots_for_microarrays.egg-info
+output
```

README.md

Lines changed: 6 additions & 10 deletions
````diff
@@ -231,8 +231,6 @@ You can read them in as part of your `read_experiment` call as follows:
 experiment = read_experiment(array_filenames, baseline=True, annotations_file='annotations.txt')
 ```
 
-Note that the `annotations_file` is an
-
 ## The dots_analysis module
 
 This is the meat of the dots_backend.
@@ -271,14 +269,16 @@ clustering.
 The `find_clusters` function returns a list of cluster numbers in the same order as the rows in the
 experiment data frame. If the method is hierarchical - `how=`hierarchical` - then the number of
 clusters is set at the square root of (number of rows divided by two), a good approximation. If the
-method is k-means - `how='kmeans'` - then values of k (the number of clusters) from 3 to 10 are tested
-using silhouette analysis and the best value picked. An additional argument passed to the function -
-`k_range=(3,51)` allows you to increase the number of values tested to, in this example, 50. Here's
+method is k-means - `how='kmeans'` - then values of k (the number of clusters) for a range of square
+numbers are tested (4, 9, 16, 25)
+using [silhouette analysis](https://en.wikipedia.org/wiki/Silhouette_(clustering)) and the best value
+picked. An additional argument passed to the function -
+`k_vals=range(3,51)` allows you to increase the number of values tested to, in this example, 50. Here's
 how to get a list of clusters with either hierarchical or k-means clustering:
 
 ```python
 hier_clusters = find_clusters(experiment_med.df, how='hierarchical')
-km_clusters = find_clusters(experiment_med.df, k_range=(3,11), how='kmeans')
+km_clusters = find_clusters(experiment_med.df, k_vals=range(3,11), how='kmeans')
 ```
 
 The `get_clusters` function includes the functionality of the `find_clusters`function, but first
@@ -349,7 +349,3 @@ to plot on the volcano plot, for example:
 ```python
 do_volcanoplot(experiment, ('treated', 'untreated'), show=False, image=False, html_file='volcano_plot.html')
 ```
-
-**Note that the function, `render_plot_to_png`, that generates the PNG versions of the plots requires the
-[PhantomJS](http://phantomjs.org) JavaScript API to be installed (it's essentially a headless browser) to
-work properly.**
````
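For reference, the silhouette-based choice of k that the updated README text describes boils down to something like the following. This is a minimal sketch rather than the package's exact code, assuming scikit-learn's `KMeans` and `silhouette_score` (the same functions `find_clusters` uses in `dots_analysis.py`):

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_best_k(df, k_vals=[4, 9, 16, 25]):
    '''Return the k from k_vals with the highest mean silhouette score.'''
    best_k, best_score = None, -1.0
    for k in k_vals:
        # Cluster the rows, then score how well separated the clusters are.
        labels = KMeans(n_clusters=k, random_state=10).fit_predict(df)
        score = silhouette_score(df, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k
```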

dots_backend/dots_analysis.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -156,13 +156,13 @@ def run_stats(experiment):
 
     return new_df
 
-def find_clusters(df, k_range=(3,11), how='hierarchical'):
+def find_clusters(df, k_vals=[4, 9, 16, 25], how='hierarchical'):
     '''Find clusters, and if method is k-means run silhouette analysis
     to determine the value of k.
 
     Args:
         df (data frame): A data frame with normalised expression data.
-        k_range (tuple): The range over which to test k.
+        k_vals (list or range): The range over which to test k.
         how ('hierarchical' or 'kmeans'): Clustering method.
 
     Returns:
@@ -184,7 +184,7 @@ def find_clusters(df, k_range=(3,11), how='hierarchical'):
 
     ## Try values of k from range and keep track of optimal k according
     ## to silhouette score.
-    for k in range(*k_range):
+    for k in k_vals:
        km = KMeans(n_clusters=k, random_state=10)
        clusters = km.fit_predict(df)
        silhouette_avg = silhouette_score(df, clusters)
@@ -274,12 +274,12 @@ def get_clusters(experiment, how='hierarchical'):
 
     ## K-means clustering with silhouette analysis to determine value of k.
     elif how == 'kmeans':
-        clusters = find_clusters(filtered_df[norm_exp_cols], k_range=(3, k_limit), how='kmeans')
+        clusters = find_clusters(filtered_df[norm_exp_cols], k_vals=range(3, k_limit), how='kmeans')
        filtered_df['cluster'] = clusters
 
     ## Sort the data frame by cluster and mean expression across samples.
     filtered_df['mean_norm_expression'] = filtered_df[norm_exp_cols].mean(axis=0)
-    filtered_df.sort(columns=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)
+    filtered_df.sort_values(by=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)
     filtered_df = filtered_df.reset_index(drop=True)
 
     return filtered_df
@@ -320,7 +320,7 @@ def keyfunc(col):
 
     ## Fix the type of the FeatureNum column and sort it.
     merged_df['FeatureNum'] = merged_df['FeatureNum'].astype(int)
-    merged_df.sort(columns='FeatureNum', ascending=True, inplace=True)
+    merged_df.sort_values(by='FeatureNum', ascending=True, inplace=True)
 
     ## Write the table.
     merged_df.to_csv(outfile, sep='\t', index=False)
```
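The `sort` → `sort_values` changes track the pandas API: `DataFrame.sort` was deprecated in pandas 0.17 and removed in 0.20, with `sort_values(by=...)` as the replacement. A minimal illustration on hypothetical data (not taken from the repo):

```python
import pandas as pd

df = pd.DataFrame({'cluster': [2, 1, 1], 'mean_norm_expression': [0.5, 2.0, 1.0]})

# Old API, removed in pandas 0.20:
# df.sort(columns=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)

# Replacement used in this commit:
df.sort_values(by=['cluster', 'mean_norm_expression'], ascending=[True, False], inplace=True)
```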

dots_tests/test_dots_analysis.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -41,12 +41,12 @@ def test_find_clusters():
     rows = np.random.choice(df.index.values, num_samples)
     sampled_df = df.ix[rows]
     hier = find_clusters(sampled_df, how='hierarchical')
-    kmeans = find_clusters(sampled_df, k_range=(3,6), how='kmeans')
+    kmeans = find_clusters(sampled_df, k_vals=range(3,6), how='kmeans')
     assert_equals(len(hier), num_samples)
     assert_equals(len(kmeans), num_samples)
 
 def test_get_clusters():
     cluster_df = get_clusters(experiment)
     assert(isinstance(cluster_df, pd.DataFrame))
     num_cols = 9 + (2 * len(list(combinations(experiment.groups, 2)))) + len(experiment.get_sampleids())
-    assert_equals(len(cluster_df.columns.values), num_cols)
+    assert_equals(len(cluster_df.columns.values), num_cols)
```

dots_tests/test_dots_arrays.py

Lines changed: 10 additions & 10 deletions
```diff
@@ -35,7 +35,7 @@
     elif h == 'ControlType':
         control_type_ind = i
 
-## Split the lines up and use the indices to pull out the values and add them
+## Split the lines up and use the indices to pull out the values and add them
 ## to the values dictionary.
 for l in lines[10:]:
     l = l.split('\t')
@@ -53,30 +53,30 @@
 array = read_array(array_fn, group, replicate)
 
 def test_probenames():
-    assert_equals(sorted(array.get_probenames()),
+    assert_equals(sorted(array.get_probenames()),
                   sorted([feat['ProbeName'] for feat in values.values()]))
 
 def test_genenames():
-    assert_equals(sorted(array.get_genenames()),
+    assert_equals(sorted(array.get_genenames()),
                   sorted([feat['GeneName'] for feat in values.values()]))
 
 def test_systematicnames():
-    assert_equals(sorted(array.get_systematicnames()),
+    assert_equals(sorted(array.get_systematicnames()),
                   sorted([feat['SystematicName'] for feat in values.values()]))
 
 def test_descriptions():
-    assert_equals(sorted(array.get_descriptions()),
+    assert_equals(sorted(array.get_descriptions()),
                   sorted([feat['Description'] for feat in values.values()]))
 
 def test_intensities():
-    assert_equals(sorted(array.get_intensities()),
+    assert_equals(sorted(array.get_intensities()),
                   sorted([float(feat['gProcessedSignal']) for feat in values.values()]))
 
 def test_normalisation():
     intensities = [float(feat['gProcessedSignal']) if float(feat['gProcessedSignal']) > 1.0 else 1.0 for feat in values.values()]
     logged = np.log2(intensities)
     logged_shifted = logged - np.percentile(logged, 75)
-    assert_equals(sorted(array.get_normalised_intensities()),
+    assert_equals(sorted(array.get_normalised_intensities()),
                   sorted(logged_shifted))
 
 def test_read_array():
@@ -104,8 +104,8 @@ def test_read_experiment():
 
 def test_arrays_attribute():
     array_exp = [array for array in experiment_2.arrays if array.sampleid == 'treated_1'][0]
-    exp_df = array_exp.df.sort(axis=1)
-    array_df = array.df.sort(axis=1)
+    exp_df = array_exp.df.sort_values(by='FeatureNum', inplace=True)
+    array_df = array.df.sort_values(by='FeatureNum', inplace=True)
     npt.assert_array_equal(exp_df, array_df)
     assert_equals(sorted(experiment.get_sampleids()), sorted(sampleids))
 
@@ -125,4 +125,4 @@ def test_get_sample_ids():
     assert_equals(sorted(experiment.get_sampleids()), sorted(sampleids))
 
 def test_remove_sample():
-    assert_equals(len(experiment_2.remove_sample(array.sampleid).get_sampleids()), len(sampleids) - 1)
+    assert_equals(len(experiment_2.remove_sample(array.sampleid).get_sampleids()), len(sampleids) - 1)
```
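One pandas behaviour worth keeping in mind when reading `test_arrays_attribute` above: `sort_values(..., inplace=True)` sorts the frame in place and returns `None`, so assigning its result does not give a sorted copy. A small standalone illustration on hypothetical data (not the repo's test):

```python
import pandas as pd

df = pd.DataFrame({'FeatureNum': [3, 1, 2]})

# inplace=True sorts df in place and returns None ...
assert df.sort_values(by='FeatureNum', inplace=True) is None

# ... so keep the returned copy when the sorted frame is needed as a value.
sorted_df = df.sort_values(by='FeatureNum').reset_index(drop=True)
```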
