diff --git a/__pycache__/gvmagic.cpython-39.pyc b/__pycache__/gvmagic.cpython-39.pyc new file mode 100644 index 00000000..b5a0342e Binary files /dev/null and b/__pycache__/gvmagic.cpython-39.pyc differ diff --git a/__pycache__/utils.cpython-39.pyc b/__pycache__/utils.cpython-39.pyc new file mode 100644 index 00000000..ff6c2ce1 Binary files /dev/null and b/__pycache__/utils.cpython-39.pyc differ diff --git a/covid_overlap.txt b/covid_overlap.txt new file mode 100644 index 00000000..9bca9022 --- /dev/null +++ b/covid_overlap.txt @@ -0,0 +1 @@ +AGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAACCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCTTATAACATGATGATCTCAGCTGGCT
TTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGAACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCCCAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAGAAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCTATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTGGATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAA
ACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATAC
CCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTG
TTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTT
GCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGC
AGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCTTTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAATTTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA \ No newline at end of file diff --git a/gbs2gps_wjw8jw.ipynb b/gbs2gps_wjw8jw.ipynb new file mode 100644 index 00000000..8ab6825b --- /dev/null +++ b/gbs2gps_wjw8jw.ipynb @@ -0,0 +1,716 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4c3befae", + "metadata": {}, + "source": [ + "# Project 1: Assembling Genes" + ] + }, + { + "cell_type": "markdown", + "id": "f98e2125", + "metadata": {}, + "source": [ + "
\n", + "
Due: Monday, September 5, 8:59pm.
\n", + "
\n", + " \n", + "
\n", + "
\n", + " Collaboration and Resource Policy\n", + "
\n", + " \n", + "For this assignment, you are encouraged to work with one other person satisfying the constraints from Class 2. \n", + "You are permitted (actually _encouraged_) to discuss these problems with anyone you want, including other students in the class. If you do discuss the specific questions in the assignment with anyone other than your assignment partner and the course staff, though, you should list them in the _External resources used_ section below.\n", + " \n", + "You are welcome to use any resources you want for this assignment, other than ones that would defeat the purpose of the assignment. This means you should not look at answers or code from previous semesters of this course, or from any other students in the class (other than your collaboration with your partner), and if you find code that implements the problem you are being asked to do for the assignment, you should not use that code. You should document all external resources you use that are not part of the course materials in the _External resources used_ section below.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "b149549a", + "metadata": {}, + "source": [ + "**Team submitting this assignment:** \n", + "
\n", + " list each member of your team here, including both your name and UVA computing id\n", + "\n", + "Team Members (Names): Genna Schwarz and Jackson Wallace\n", + "\n", + "Team Member UVA Computing IDs: gbs2gps and wjw8jw\n", + "\n", + "
\n", + "\n", + "**External resources used:** \n", + "
\n", + "It is not necessary to list the course materials, but if you used any other resources, including discussing problems with students not on your team, list them here.\n", + " \n", + "External Resources Used: \n", + "\n", + "https://stackoverflow.com/questions/46480309/de-bruijn-graph-code-python-3\n", + "\n", + "https://towardsdatascience.com/genome-assembly-using-de-bruijn-graphs-69570efcc270\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "dc10576d", + "metadata": {}, + "source": [ + "In this project, we will explore genome assembly—the process of determining the order of nucleotides in DNA from fragmented reads. As you might have studied in the reading assignments, genome assembly can get quite complicated, as problems such as full sequence coverage, finding a good length for reads (the $k$ in $k$-mer), and sequencing errors present challenges for sequencing analysis and accuracy. You can assume perfect coverage for all parts of the assignment and no read errors for the first two questions.\n", + "\n", + "\n", + "Submission: Please submit the code you wrote to generate your answers for all parts using this form: https://forms.gle/rNTXfYojTLEQ8idg6. Your answers should be in the Jupyter Notebook, along with your code. Before submission, you should make a copy of your notebook file with the name uvaid1\\_uvaid2.ipynb (where uvaidn is each teammates UVA id) so the submitted file identifies you. You and your partner should submit a single file once together. Submission is due 8:59 pm (EST) on Monday, September 5." + ] + }, + { + "cell_type": "markdown", + "id": "d64f29ed", + "metadata": {}, + "source": [ + "## Install basic required packages." + ] + }, + { + "cell_type": "markdown", + "id": "0d00d3f2", + "metadata": {}, + "source": [ + "- Install basic required packages, should be run only once. 
You may need to restart the kernel after this stage.\n", + "- Make sure you have [graphviz](https://graphviz.org/download/) installed on your system.\n", + "- The second cell adds Graphviz to your path, you may have to change based on where the install folder is.\n", + "\n", + "NOTE: We provide utils.py, which may contain helpful functions for you to use, as well as gvmagic.py, which is a deprecated package to use graphviz within the notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "47d490f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: numpy in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from -r requirements.txt (line 1)) (1.20.3)\n", + "Requirement already satisfied: matplotlib in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from -r requirements.txt (line 2)) (3.5.1)\n", + "Requirement already satisfied: pydot in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from -r requirements.txt (line 3)) (1.4.2)\n", + "Requirement already satisfied: graphviz in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from -r requirements.txt (line 4)) (0.20.1)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib->-r requirements.txt (line 2)) (0.11.0)\n", + "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib->-r requirements.txt (line 2)) (2.8.2)\n", + "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib->-r requirements.txt (line 2)) (9.0.1)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in 
c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib->-r requirements.txt (line 2)) (3.0.6)\n", + "Requirement already satisfied: packaging>=20.0 in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib->-r requirements.txt (line 2)) (21.3)\n", + "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib->-r requirements.txt (line 2)) (4.29.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib->-r requirements.txt (line 2)) (1.3.2)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\jacks\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from python-dateutil>=2.7->matplotlib->-r requirements.txt (line 2)) (1.16.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: You are using pip version 21.3.1; however, version 22.2.2 is available.\n", + "You should consider upgrading via the 'c:\\Users\\jacks\\AppData\\Local\\Programs\\Python\\Python39\\python.exe -m pip install --upgrade pip' command.\n" + ] + } + ], + "source": [ + "%pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "4bbc8cb8", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"PATH\"] += os.pathsep + 'C:/Program Files/Graphviz/bin'" + ] + }, + { + "cell_type": "markdown", + "id": "031e1f5a", + "metadata": {}, + "source": [ + "## Genome Assembly\n", + "\n", + "For this part, you're given reads generated while trying to sequence the DNA of a TeleTubby (some unknown organism) with a _very_ small genetic code. 
By answering the following questions, you will learn how to assemble the original genome sequence from sequence reads.\n", + "\n", + "Sequencing data is often stored in FASTQ file format. In TeleTubby.fastq, you will find the data organized in a particular order that repeats every four lines. The first line contains the metadata that encodes the name of the read, the experiment type, the kind of sequencing machine used, etc. The second line is the sequence of bases. The third line functions as a placeholder line. The fourth line is a sequence of base qualities that encode the qualities for the corresponding bases in the sequence line. We will only work with the sequence and quality score lines in this question." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "16dd096f", + "metadata": {}, + "outputs": [], + "source": [ + "import collections\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import utils\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "cdb93d49", + "metadata": {}, + "source": [ + "#### Question 1.1.1 GC-content\n", + "\n", + "The GC-content (or the ratio of G and C nucleotides) is related to the melting temperature of the DNA double helix. Use the following equation to calculate the melting temperature of DNA for TeleTubby $t_m$ in Celsius:\n", + "\n", + "\\begin{equation*}\n", + "t_m = 64.9+0.41(\\%GC)-\\frac{500}{\\text{length of sequence}}\n", + "\\end{equation*}\n", + "\n", + "As a reference, the human genome is known to have between 35%-60% GC-content. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "98e17e2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "%GC content: 47.95221843003413\n", + "Temperature in celsius: 64.88329351535837\n" + ] + } + ], + "source": [ + "# Read sequence reads (error-free) from file\n", + "sequence_reads, qualities = utils.read_fastq('TeleTubby.fastq')\n", + "\n", + "# Total number of bases across all reads (no assumption about the read length)\n", + "sequence_len = sum(len(read) for read in sequence_reads)\n", + "\n", + "# Calculate %GC content; GC_percent itself is kept as a fraction in [0, 1]\n", + "gc_count = sum(read.count('G') + read.count('C') for read in sequence_reads)\n", + "GC_percent = gc_count / sequence_len\n", + "print(\"%GC content:\", GC_percent * 100)\n", + "\n", + "# Print out temperature in Celsius.\n", + "# The formula takes %GC as a percentage (0-100), so scale the fraction by 100.\n", + "temp_c = 64.9 + (0.41 * GC_percent * 100) - (500 / sequence_len)\n", + "print(\"Temperature in celsius:\", temp_c)" + ] + }, + { + "cell_type": "markdown", + "id": "14adb12e", + "metadata": {}, + "source": [ + "#### Question 1.1.2 Interpreting quality scores" + ] + }, + { + "cell_type": "markdown", + "id": "24c98e15", + "metadata": {}, + "source": [ + "Phred33 quality scores are represented as the character with an ASCII code equal to its value + 33 (to make them easy to print alongside genome sequences). List the top 5 most frequent scores in ASCII symbol as well as their Phred33 scores in TeleTubby.fastq. You can refer to the [official Illumina website](https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/QualityScoreEncoding_swBS.htm) to reference the scoring encoding.\n", + "\n", + "What is the average Phred33 score in TeleTubby.fastq?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "106c8af4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ASCII Symbol Top 5: ['5', '?', 'D', 'K', 'F']\n", + "Phredd33 Top 5: [20, 30, 35, 42, 37]\n", + "Average Phred33 Score 34.5\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "# Concatenate all quality strings into one sequence of quality symbols\n", + "listQualities = ''.join(qualities)\n", + "\n", + "# Sorting on the basis of the frequency of each symbol\n", + "counter = Counter(listQualities)\n", + "\n", + "topSymb = sorted(counter, key=counter.get, reverse=True)[:5]\n", + "print('ASCII Symbol Top 5:', topSymb)\n", + "topPhred = [ord(each) - 33 for each in topSymb]\n", + "print('Phredd33 Top 5:', topPhred)\n", + "\n", + "# Calculate and print average Phred33 score.\n", + "# Average over every symbol in the file, not just over the distinct symbols,\n", + "# so that frequent scores weigh more than rare ones.\n", + "average = np.mean([ord(each) - 33 for each in listQualities])\n", + "print('Average Phred33 Score', str(average))" + ] + }, + { + "cell_type": "markdown", + "id": "71fca337", + "metadata": {}, + "source": [ + "#### Question 1.1.3 Frequency analysis\n", + "\n", + "Looking at repetitions in the sequence can be helpful in estimating the \"redundancy\" in the organisms. Humans and other evolved animals have a lot of redundancy, while smaller organisms like bacteria have highly packed genomes. 
One heuristic to estimate this before actually performing the assembly could be looking at how often certain $k$-mers are repeated.\n", + "\n", + "Print out the 3 most frequent k-mers with their frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "ae436ba6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "K-mer: GCTATCGC\n", + "Frequency of GCTATCGC: 3\n", + "K-mer: TATCGCAA\n", + "Frequency of TATCGCAA: 2\n", + "K-mer: CTATCGCA\n", + "Frequency of CTATCGCA: 2\n" + ] + } + ], + "source": [ + "# Find and print out the three most repeated k-mers and their frequencies\n", + "sequence_reads, quals = utils.read_fastq('TeleTubby.fastq')\n", + "\n", + "# Count how often each read (k-mer) occurs; the counts are left intact afterwards\n", + "dictionary = collections.Counter(sequence_reads)\n", + "\n", + "# most_common(3) yields the three highest counts in descending order\n", + "for name, frequency in dictionary.most_common(3):\n", + "    print(\"K-mer:\", name)\n", + "    print(\"Frequency of\", name + ':', frequency)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0213a496", + "metadata": {}, + "source": [ + "### Question 1.2. Greedy approach" + ] + }, + { + "cell_type": "markdown", + "id": "4dc055d5", + "metadata": {}, + "source": [ + "One of the approaches to assemble the genome from the given reads is a greedy algorithm. Have a look at the greedy algorithm described on [Wikipedia](https://en.wikipedia.org/wiki/Sequence_assembly#Greedy_algorithm) and answer the following." + ] + }, + { + "cell_type": "markdown", + "id": "fbddd3d7", + "metadata": {}, + "source": [ + "#### Question 1.2.1 What would the runtime be of this algorithm, given $n$ $k$-mer reads?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "fb26aec0", + "metadata": {}, + "source": [ + "Answer: O(k*n^2)" + ] + }, + { + "cell_type": "markdown", + "id": "c9c6d7d7", + "metadata": {}, + "source": [ + "#### Question 1.2.2 Would this algorithm always yield a unique solution?" + ] + }, + { + "cell_type": "markdown", + "id": "528b95a2", + "metadata": {}, + "source": [ + "Answer: This algorithm would yield the same solution if it selected the same starting k-mer each time. Otherwise, it would yield different results stemming from local greedy choices. " + ] + }, + { + "cell_type": "markdown", + "id": "217cb44f", + "metadata": {}, + "source": [ + "#### Question 1.2.3 Would this algorithm always yield the right solution?" + ] + }, + { + "cell_type": "markdown", + "id": "62f3cbbb", + "metadata": {}, + "source": [ + "Answer: No, this algorithm has no understanding of the global problem. It has no way of guaranteeing that the overall solution is correct even though it can guarantee the correct solutions to the overall solution's subproblems." + ] + }, + { + "cell_type": "markdown", + "id": "b97ed918", + "metadata": {}, + "source": [ + "### Question 1.3 Graph-based approaches" + ] + }, + { + "cell_type": "markdown", + "id": "12e8f8a0", + "metadata": {}, + "source": [ + "Graphs for genome assembly can be constructed in two ways:\n", + "\n", + "- de Bruijn graph: Processing $k$-mers as edges, with $(k-1)$-mers as nodes, and\n", + "- Overlap graph: Processing $k$-mers as nodes, with $(k-1)$-mer overlaps as edges.\n", + "\n", + "de Bruijn graphs can be processed to find Euler paths, while Overlap graphs can be processed to find Hamiltonian paths. Both of these are valid ways to reconstruct the original genome.\n", + "\n", + "Use one of these two techniques to reconstruct the sequence, and print out your reconstructed sequence. Which method did you pick out of the two, and why? (hint: imagine what would happen when we have millions of reads). 
Use the k-mers provided in TeleTubby.fastq.\n", + "\n", + "We provide some skeleton code that you may use, but you may also come up with your own solution." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "b0f8979b", + "metadata": {}, + "outputs": [], + "source": [ + "# Read reads into graph\n", + "\n", + "def build_graph(k_mers):\n", + "    \"\"\"Build the edge list of a de Bruijn graph from k-mer reads.\n", + "\n", + "    Every k-mer contributes one edge from its (k-1)-prefix to its (k-1)-suffix.\n", + "\n", + "    Args:\n", + "        k_mers: sequence of equal-length read strings.\n", + "\n", + "    Returns:\n", + "        nodes: the k-mers, in input order.\n", + "        edges: one (prefix, suffix) tuple per k-mer, in input order.\n", + "        start: a prefix with more outgoing than incoming edges, i.e. where an\n", + "            Eulerian path has to begin. Falls back to the first prefix when\n", + "            every node is balanced, and is None when k_mers is empty.\n", + "    \"\"\"\n", + "    nodes = list(k_mers)\n", + "    edges = [(k_mer[:-1], k_mer[1:]) for k_mer in k_mers]\n", + "\n", + "    # Out-degree minus in-degree of every (k-1)-mer that appears in an edge\n", + "    balance = collections.Counter()\n", + "    for pref, suff in edges:\n", + "        balance[pref] += 1\n", + "        balance[suff] -= 1\n", + "\n", + "    # get start node: the first (k-1)-mer that is left more often than entered\n", + "    start = next((node for node, diff in balance.items() if diff > 0), None)\n", + "    if start is None and edges:\n", + "        # Balanced graph (Eulerian cycle): any node works, use the first prefix\n", + "        start = edges[0][0]\n", + "\n", + "    return nodes, edges, start\n", + "\n", + "nodes_bruijn, edges_bruijn, start = build_graph(sequence_reads)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "4d17dd59", + "metadata": {}, + "outputs": [], + "source": [ + "# Implement assembly algorithm\n", + "#Algorithm Completed in In[41]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "b345a779", + "metadata": {}, + "outputs": [], + "source": [ + "# Print assembled sequence\n", + "# Printed and completed in In[42]" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "1da22daf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The gvmagic extension is already loaded. 
To reload it, use:\n", + " %reload_ext gvmagic\n" + ] + } + ], + "source": [ + "# Needed to properly visualize graphs\n", + "%load_ext gvmagic" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "62f3fc16", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": "\n\nde Bruijn graph\n\n\n\nTTACAAG\n\nTTACAAG\n\n\n\nTACAAGA\n\nTACAAGA\n\n\n\nTTACAAG->TACAAGA\n\n\n\n\n\nACAAGAA\n\nACAAGAA\n\n\n\nTACAAGA->ACAAGAA\n\n\n\n\n\nTCATGCA\n\nTCATGCA\n\n\n\nCATGCAT\n\nCATGCAT\n\n\n\nTCATGCA->CATGCAT\n\n\n\n\n\nATGCATC\n\nATGCATC\n\n\n\nCATGCAT->ATGCATC\n\n\n\n\n\nAGGCGCT\n\nAGGCGCT\n\n\n\nGGCGCTA\n\nGGCGCTA\n\n\n\nAGGCGCT->GGCGCTA\n\n\n\n\n\nGCGCTAT\n\nGCGCTAT\n\n\n\nGGCGCTA->GCGCTAT\n\n\n\n\n\nCGGACTC\n\nCGGACTC\n\n\n\nGGACTCT\n\nGGACTCT\n\n\n\nCGGACTC->GGACTCT\n\n\n\n\n\nGACTCTT\n\nGACTCTT\n\n\n\nGGACTCT->GACTCTT\n\n\n\n\n\nTTTCATG\n\nTTTCATG\n\n\n\nTTCATGA\n\nTTCATGA\n\n\n\nTTTCATG->TTCATGA\n\n\n\n\n\nTCATGAG\n\nTCATGAG\n\n\n\nTTCATGA->TCATGAG\n\n\n\n\n\nCGCTATC\n\nCGCTATC\n\n\n\nGCGCTAT->CGCTATC\n\n\n\n\n\nGCTATCG\n\nGCTATCG\n\n\n\nCGCTATC->GCTATCG\n\n\n\n\n\nCGCTATC->GCTATCG\n\n\n\n\n\nCGACTGT\n\nCGACTGT\n\n\n\nGACTGTC\n\nGACTGTC\n\n\n\nCGACTGT->GACTGTC\n\n\n\n\n\nACTGTCG\n\nACTGTCG\n\n\n\nGACTGTC->ACTGTCG\n\n\n\n\n\nATCGCAA\n\nATCGCAA\n\n\n\nTCGCAAC\n\nTCGCAAC\n\n\n\nATCGCAA->TCGCAAC\n\n\n\n\n\nTCGCAAA\n\nTCGCAAA\n\n\n\nATCGCAA->TCGCAAA\n\n\n\n\n\nCGCAACC\n\nCGCAACC\n\n\n\nTCGCAAC->CGCAACC\n\n\n\n\n\nTCGCTGC\n\nTCGCTGC\n\n\n\nCGCTGCC\n\nCGCTGCC\n\n\n\nTCGCTGC->CGCTGCC\n\n\n\n\n\nGCTGCCC\n\nGCTGCCC\n\n\n\nCGCTGCC->GCTGCCC\n\n\n\n\n\nCGCAAAC\n\nCGCAAAC\n\n\n\nGCAAACC\n\nGCAAACC\n\n\n\nCGCAAAC->GCAAACC\n\n\n\n\n\nCAAACCG\n\nCAAACCG\n\n\n\nGCAAACC->CAAACCG\n\n\n\n\n\nCAAGAAT\n\nCAAGAAT\n\n\n\nAAGAATT\n\nAAGAATT\n\n\n\nCAAGAAT->AAGAATT\n\n\n\n\n\nAGAATTA\n\nAGAATTA\n\n\n\nAAGAATT->AGAATTA\n\n\n\n\n\nGTCGGAC\n\nGTCGGAC\n\n\n\nTCGGACT\n\nTCGGACT\n\n\n\nGTCGGAC->TCGGACT\n\n\n\n\n\nTCGGACT->CGGACTC\n\n\n\n\n\nCCGCCCT\n\nCCGCCCT\n\n\n\nCGCCCTT\n\nCGCC
CTT\n\n\n\nCCGCCCT->CGCCCTT\n\n\n\n\n\nGCCCTTC\n\nGCCCTTC\n\n\n\nCGCCCTT->GCCCTTC\n\n\n\n\n\nCGAGAGA\n\nCGAGAGA\n\n\n\nGAGAGAT\n\nGAGAGAT\n\n\n\nCGAGAGA->GAGAGAT\n\n\n\n\n\nAGAGATT\n\nAGAGATT\n\n\n\nGAGAGAT->AGAGATT\n\n\n\n\n\nGAGGCGC\n\nGAGGCGC\n\n\n\nGAGGCGC->AGGCGCT\n\n\n\n\n\nTTGTGCT\n\nTTGTGCT\n\n\n\nTGTGCTA\n\nTGTGCTA\n\n\n\nTTGTGCT->TGTGCTA\n\n\n\n\n\nGTGCTAT\n\nGTGCTAT\n\n\n\nTGTGCTA->GTGCTAT\n\n\n\n\n\nGGGAGTA\n\nGGGAGTA\n\n\n\nGGAGTAT\n\nGGAGTAT\n\n\n\nGGGAGTA->GGAGTAT\n\n\n\n\n\nGAGTATG\n\nGAGTATG\n\n\n\nGGAGTAT->GAGTATG\n\n\n\n\n\nGTTAGCA\n\nGTTAGCA\n\n\n\nTTAGCAG\n\nTTAGCAG\n\n\n\nGTTAGCA->TTAGCAG\n\n\n\n\n\nTAGCAGT\n\nTAGCAGT\n\n\n\nTTAGCAG->TAGCAGT\n\n\n\n\n\nTGTCGGA\n\nTGTCGGA\n\n\n\nTGTCGGA->GTCGGAC\n\n\n\n\n\nCCCGGAG\n\nCCCGGAG\n\n\n\nCCGGAGG\n\nCCGGAGG\n\n\n\nCCCGGAG->CCGGAGG\n\n\n\n\n\nCGGAGGC\n\nCGGAGGC\n\n\n\nCCGGAGG->CGGAGGC\n\n\n\n\n\nGCACATC\n\nGCACATC\n\n\n\nCACATCC\n\nCACATCC\n\n\n\nGCACATC->CACATCC\n\n\n\n\n\nACATCCG\n\nACATCCG\n\n\n\nCACATCC->ACATCCG\n\n\n\n\n\nAGGGGTT\n\nAGGGGTT\n\n\n\nGGGGTTT\n\nGGGGTTT\n\n\n\nAGGGGTT->GGGGTTT\n\n\n\n\n\nGGGTTTT\n\nGGGTTTT\n\n\n\nGGGGTTT->GGGTTTT\n\n\n\n\n\nCTATCGC\n\nCTATCGC\n\n\n\nGCTATCG->CTATCGC\n\n\n\n\n\nGCTATCG->CTATCGC\n\n\n\n\n\nGCTATCG->CTATCGC\n\n\n\n\n\nTAACAAC\n\nTAACAAC\n\n\n\nAACAACT\n\nAACAACT\n\n\n\nTAACAAC->AACAACT\n\n\n\n\n\nACAACTT\n\nACAACTT\n\n\n\nAACAACT->ACAACTT\n\n\n\n\n\nTGGTGCC\n\nTGGTGCC\n\n\n\nGGTGCCG\n\nGGTGCCG\n\n\n\nTGGTGCC->GGTGCCG\n\n\n\n\n\nGTGCCGC\n\nGTGCCGC\n\n\n\nGGTGCCG->GTGCCGC\n\n\n\n\n\nTGCTATC\n\nTGCTATC\n\n\n\nGTGCTAT->TGCTATC\n\n\n\n\n\nTGCTATC->GCTATCG\n\n\n\n\n\nCGACATA\n\nCGACATA\n\n\n\nGACATAT\n\nGACATAT\n\n\n\nCGACATA->GACATAT\n\n\n\n\n\nACATATC\n\nACATATC\n\n\n\nGACATAT->ACATATC\n\n\n\n\n\nCATGAGC\n\nCATGAGC\n\n\n\nTCATGAG->CATGAGC\n\n\n\n\n\nAGTGGGA\n\nAGTGGGA\n\n\n\nGTGGGAG\n\nGTGGGAG\n\n\n\nAGTGGGA->GTGGGAG\n\n\n\n\n\nTGGGAGT\n\nTGGGAGT\n\n\n\nGTGGGAG->TGGGAGT\n\n\n\n\n\nTACGCCA\n\nTACGCCA\n\n\n\nACGCCAA\n\nACGCCAA\n\n\n\nTACGCCA->ACGCCAA\n\n\n\n\n\
nCGCCAAA\n\nCGCCAAA\n\n\n\nACGCCAA->CGCCAAA\n\n\n\n\n\nTTGTTTC\n\nTTGTTTC\n\n\n\nTGTTTCT\n\nTGTTTCT\n\n\n\nTTGTTTC->TGTTTCT\n\n\n\n\n\nGTTTCTC\n\nGTTTCTC\n\n\n\nTGTTTCT->GTTTCTC\n\n\n\n\n\nGATGCAA\n\nGATGCAA\n\n\n\nATGCAAT\n\nATGCAAT\n\n\n\nGATGCAA->ATGCAAT\n\n\n\n\n\nTGCAATG\n\nTGCAATG\n\n\n\nATGCAAT->TGCAATG\n\n\n\n\n\nCGCAGGA\n\nCGCAGGA\n\n\n\nGCAGGAT\n\nGCAGGAT\n\n\n\nCGCAGGA->GCAGGAT\n\n\n\n\n\nCAGGATA\n\nCAGGATA\n\n\n\nGCAGGAT->CAGGATA\n\n\n\n\n\nCTAAGAG\n\nCTAAGAG\n\n\n\nTAAGAGA\n\nTAAGAGA\n\n\n\nCTAAGAG->TAAGAGA\n\n\n\n\n\nAAGAGAG\n\nAAGAGAG\n\n\n\nTAAGAGA->AAGAGAG\n\n\n\n\n\nTAGCAAT\n\nTAGCAAT\n\n\n\nAGCAATG\n\nAGCAATG\n\n\n\nTAGCAAT->AGCAATG\n\n\n\n\n\nGCAATGC\n\nGCAATGC\n\n\n\nAGCAATG->GCAATGC\n\n\n\n\n\nATAGCAA\n\nATAGCAA\n\n\n\nATAGCAA->TAGCAAT\n\n\n\n\n\nAGTATGG\n\nAGTATGG\n\n\n\nGAGTATG->AGTATGG\n\n\n\n\n\nCCAAACA\n\nCCAAACA\n\n\n\nCAAACAC\n\nCAAACAC\n\n\n\nCCAAACA->CAAACAC\n\n\n\n\n\nAAACACT\n\nAAACACT\n\n\n\nCAAACAC->AAACACT\n\n\n\n\n\nCTGCCCG\n\nCTGCCCG\n\n\n\nGCTGCCC->CTGCCCG\n\n\n\n\n\nTGCCCGG\n\nTGCCCGG\n\n\n\nCTGCCCG->TGCCCGG\n\n\n\n\n\nAAATAGC\n\nAAATAGC\n\n\n\nAATAGCA\n\nAATAGCA\n\n\n\nAAATAGC->AATAGCA\n\n\n\n\n\nAATAGCA->ATAGCAA\n\n\n\n\n\nAGGATAA\n\nAGGATAA\n\n\n\nCAGGATA->AGGATAA\n\n\n\n\n\nTATCGAC\n\nTATCGAC\n\n\n\nATCGACA\n\nATCGACA\n\n\n\nTATCGAC->ATCGACA\n\n\n\n\n\nTCGACAT\n\nTCGACAT\n\n\n\nATCGACA->TCGACAT\n\n\n\n\n\nCTTTACA\n\nCTTTACA\n\n\n\nTTTACAA\n\nTTTACAA\n\n\n\nCTTTACA->TTTACAA\n\n\n\n\n\nTTTACAA->TTACAAG\n\n\n\n\n\nAGAGAGA\n\nAGAGAGA\n\n\n\nAAGAGAG->AGAGAGA\n\n\n\n\n\nTCTTTCA\n\nTCTTTCA\n\n\n\nCTTTCAT\n\nCTTTCAT\n\n\n\nTCTTTCA->CTTTCAT\n\n\n\n\n\nCTTTCAT->TTTCATG\n\n\n\n\n\nAAACCGA\n\nAAACCGA\n\n\n\nCAAACCG->AAACCGA\n\n\n\n\n\nAACCGAC\n\nAACCGAC\n\n\n\nAAACCGA->AACCGAC\n\n\n\n\n\nATGGTGC\n\nATGGTGC\n\n\n\nTGGTGCA\n\nTGGTGCA\n\n\n\nATGGTGC->TGGTGCA\n\n\n\n\n\nGGTGCAC\n\nGGTGCAC\n\n\n\nTGGTGCA->GGTGCAC\n\n\n\n\n\nTGGGAGT->GGGAGTA\n\n\n\n\n\nTCTTTAC\n\nTCTTTAC\n\n\n\nTCTTTAC->CTTTACA\n\n\n\n\n\nTTACAGG\n\nTTACAGG\n\n\n\nTACAGGA\n
\nTACAGGA\n\n\n\nTTACAGG->TACAGGA\n\n\n\n\n\nACAGGAG\n\nACAGGAG\n\n\n\nTACAGGA->ACAGGAG\n\n\n\n\n\nTTTCTCG\n\nTTTCTCG\n\n\n\nGTTTCTC->TTTCTCG\n\n\n\n\n\nGATTTGT\n\nGATTTGT\n\n\n\nATTTGTG\n\nATTTGTG\n\n\n\nGATTTGT->ATTTGTG\n\n\n\n\n\nTTTGTGC\n\nTTTGTGC\n\n\n\nATTTGTG->TTTGTGC\n\n\n\n\n\nACTTATG\n\nACTTATG\n\n\n\nCTTATGT\n\nCTTATGT\n\n\n\nACTTATG->CTTATGT\n\n\n\n\n\nTTATGTA\n\nTTATGTA\n\n\n\nCTTATGT->TTATGTA\n\n\n\n\n\nGTTGTTT\n\nGTTGTTT\n\n\n\nGTTGTTT->TTGTTTC\n\n\n\n\n\nGCCCGGA\n\nGCCCGGA\n\n\n\nTGCCCGG->GCCCGGA\n\n\n\n\n\nGCCCGGA->CCCGGAG\n\n\n\n\n\nGGAGGCG\n\nGGAGGCG\n\n\n\nGGAGGCG->GAGGCGC\n\n\n\n\n\nAACACTC\n\nAACACTC\n\n\n\nAAACACT->AACACTC\n\n\n\n\n\nACACTCG\n\nACACTCG\n\n\n\nAACACTC->ACACTCG\n\n\n\n\n\nTATCGCT\n\nTATCGCT\n\n\n\nCTATCGC->TATCGCT\n\n\n\n\n\nTATCGCA\n\nTATCGCA\n\n\n\nCTATCGC->TATCGCA\n\n\n\n\n\nCTATCGC->TATCGCA\n\n\n\n\n\nGGTTTTG\n\nGGTTTTG\n\n\n\nGGGTTTT->GGTTTTG\n\n\n\n\n\nGTTTTGT\n\nGTTTTGT\n\n\n\nGGTTTTG->GTTTTGT\n\n\n\n\n\nTGCCCGC\n\nTGCCCGC\n\n\n\nGCCCGCC\n\nGCCCGCC\n\n\n\nTGCCCGC->GCCCGCC\n\n\n\n\n\nCCCGCCA\n\nCCCGCCA\n\n\n\nGCCCGCC->CCCGCCA\n\n\n\n\n\nTCATGGT\n\nTCATGGT\n\n\n\nCATGGTA\n\nCATGGTA\n\n\n\nTCATGGT->CATGGTA\n\n\n\n\n\nATGGTAT\n\nATGGTAT\n\n\n\nCATGGTA->ATGGTAT\n\n\n\n\n\nATCGCTG\n\nATCGCTG\n\n\n\nTATCGCT->ATCGCTG\n\n\n\n\n\nATCGCTA\n\nATCGCTA\n\n\n\nTATCGCT->ATCGCTA\n\n\n\n\n\nATCGCTG->TCGCTGC\n\n\n\n\n\nTGGTATC\n\nTGGTATC\n\n\n\nGGTATCG\n\nGGTATCG\n\n\n\nTGGTATC->GGTATCG\n\n\n\n\n\nGTATCGA\n\nGTATCGA\n\n\n\nGGTATCG->GTATCGA\n\n\n\n\n\nCATATCG\n\nCATATCG\n\n\n\nACATATC->CATATCG\n\n\n\n\n\nATAACAA\n\nATAACAA\n\n\n\nATAACAA->TAACAAC\n\n\n\n\n\nGTATCGA->TATCGAC\n\n\n\n\n\nAGATTTG\n\nAGATTTG\n\n\n\nAGATTTG->GATTTGT\n\n\n\n\n\nCCGACTG\n\nCCGACTG\n\n\n\nCCGACTG->CGACTGT\n\n\n\n\n\nACCGACT\n\nACCGACT\n\n\n\nACCGACT->CCGACTG\n\n\n\n\n\nGCAACCT\n\nGCAACCT\n\n\n\nCAACCTA\n\nCAACCTA\n\n\n\nGCAACCT->CAACCTA\n\n\n\n\n\nAACCTAA\n\nAACCTAA\n\n\n\nCAACCTA->AACCTAA\n\n\n\n\n\nCGCAACC->GCAACCT\n\n\n\n\n\nATGAGCA\n\nATGAGCA\n\n\n\nTGAGCAA\n\nTG
AGCAA\n\n\n\nATGAGCA->TGAGCAA\n\n\n\n\n\nGAGCAAA\n\nGAGCAAA\n\n\n\nTGAGCAA->GAGCAAA\n\n\n\n\n\nGATAACA\n\nGATAACA\n\n\n\nGATAACA->ATAACAA\n\n\n\n\n\nTGTACTA\n\nTGTACTA\n\n\n\nGTACTAC\n\nGTACTAC\n\n\n\nTGTACTA->GTACTAC\n\n\n\n\n\nTACTACA\n\nTACTACA\n\n\n\nGTACTAC->TACTACA\n\n\n\n\n\nCATCCGC\n\nCATCCGC\n\n\n\nATCCGCT\n\nATCCGCT\n\n\n\nCATCCGC->ATCCGCT\n\n\n\n\n\nTCCGCTA\n\nTCCGCTA\n\n\n\nATCCGCT->TCCGCTA\n\n\n\n\n\nTGCGCAG\n\nTGCGCAG\n\n\n\nGCGCAGG\n\nGCGCAGG\n\n\n\nTGCGCAG->GCGCAGG\n\n\n\n\n\nGCGCAGG->CGCAGGA\n\n\n\n\n\nCAGGAGC\n\nCAGGAGC\n\n\n\nACAGGAG->CAGGAGC\n\n\n\n\n\nTATGTAC\n\nTATGTAC\n\n\n\nTTATGTA->TATGTAC\n\n\n\n\n\nACTACAT\n\nACTACAT\n\n\n\nTACTACA->ACTACAT\n\n\n\n\n\nCTACATG\n\nCTACATG\n\n\n\nACTACAT->CTACATG\n\n\n\n\n\nTCGCTAC\n\nTCGCTAC\n\n\n\nATCGCTA->TCGCTAC\n\n\n\n\n\nCGCTACT\n\nCGCTACT\n\n\n\nTCGCTAC->CGCTACT\n\n\n\n\n\nCTACTGG\n\nCTACTGG\n\n\n\nTACTGGT\n\nTACTGGT\n\n\n\nCTACTGG->TACTGGT\n\n\n\n\n\nACTGGTG\n\nACTGGTG\n\n\n\nTACTGGT->ACTGGTG\n\n\n\n\n\nACAAGAA->CAAGAAT\n\n\n\n\n\nGTTTCTT\n\nGTTTCTT\n\n\n\nTTTCTTC\n\nTTTCTTC\n\n\n\nGTTTCTT->TTTCTTC\n\n\n\n\n\nTTCTTCA\n\nTTCTTCA\n\n\n\nTTTCTTC->TTCTTCA\n\n\n\n\n\nCCCTTCG\n\nCCCTTCG\n\n\n\nGCCCTTC->CCCTTCG\n\n\n\n\n\nCCTTCGA\n\nCCTTCGA\n\n\n\nCCCTTCG->CCTTCGA\n\n\n\n\n\nAGCAAAA\n\nAGCAAAA\n\n\n\nGCAAAAA\n\nGCAAAAA\n\n\n\nAGCAAAA->GCAAAAA\n\n\n\n\n\nCAAAAAA\n\nCAAAAAA\n\n\n\nGCAAAAA->CAAAAAA\n\n\n\n\n\nATGTACT\n\nATGTACT\n\n\n\nTATGTAC->ATGTACT\n\n\n\n\n\nATGTACT->TGTACTA\n\n\n\n\n\nTCGCAAA->CGCAAAC\n\n\n\n\n\nCTTCGAT\n\nCTTCGAT\n\n\n\nCCTTCGA->CTTCGAT\n\n\n\n\n\nTTCGATG\n\nTTCGATG\n\n\n\nCTTCGAT->TTCGATG\n\n\n\n\n\nTTTTGTG\n\nTTTTGTG\n\n\n\nGTTTTGT->TTTTGTG\n\n\n\n\n\nGAATTAC\n\nGAATTAC\n\n\n\nAATTACA\n\nAATTACA\n\n\n\nGAATTAC->AATTACA\n\n\n\n\n\nATTACAG\n\nATTACAG\n\n\n\nAATTACA->ATTACAG\n\n\n\n\n\nTATCGCA->ATCGCAA\n\n\n\n\n\nTATCGCA->ATCGCAA\n\n\n\n\n\nTACATGT\n\nTACATGT\n\n\n\nCTACATG->TACATGT\n\n\n\n\n\nACATGTT\n\nACATGTT\n\n\n\nTACATGT->ACATGTT\n\n\n\n\n\nGCCAAAT\n\nGCCAAAT\n\n\n\nCGCCAAA->GCCAAAT\
n\n\n\n\n\nCCAAATA\n\nCCAAATA\n\n\n\nGCCAAAT->CCAAATA\n\n\n\n\n\nTCGCTGT\n\nTCGCTGT\n\n\n\nCGCTGTC\n\nCGCTGTC\n\n\n\nTCGCTGT->CGCTGTC\n\n\n\n\n\nGCTGTCA\n\nGCTGTCA\n\n\n\nCGCTGTC->GCTGTCA\n\n\n\n\n\nGTGCACA\n\nGTGCACA\n\n\n\nTGCACAT\n\nTGCACAT\n\n\n\nGTGCACA->TGCACAT\n\n\n\n\n\nTGCACAT->GCACATC\n\n\n\n\n\nTGTCATG\n\nTGTCATG\n\n\n\nGTCATGG\n\nGTCATGG\n\n\n\nTGTCATG->GTCATGG\n\n\n\n\n\nGTCATGG->TCATGGT\n\n\n\n\n\nGAGATTT\n\nGAGATTT\n\n\n\nAGAGATT->GAGATTT\n\n\n\n\n\nTCGTGCC\n\nTCGTGCC\n\n\n\nCGTGCCC\n\nCGTGCCC\n\n\n\nTCGTGCC->CGTGCCC\n\n\n\n\n\nGTGCCCG\n\nGTGCCCG\n\n\n\nCGTGCCC->GTGCCCG\n\n\n\n\n\nAGGAGCC\n\nAGGAGCC\n\n\n\nCAGGAGC->AGGAGCC\n\n\n\n\n\nGGAGCCA\n\nGGAGCCA\n\n\n\nAGGAGCC->GGAGCCA\n\n\n\n\n\nGTGCCCG->TGCCCGC\n\n\n\n\n\nACATCCG->CATCCGC\n\n\n\n\n\nTCGATGC\n\nTCGATGC\n\n\n\nTTCGATG->TCGATGC\n\n\n\n\n\nCGATGCA\n\nCGATGCA\n\n\n\nTCGATGC->CGATGCA\n\n\n\n\n\nCAATGCG\n\nCAATGCG\n\n\n\nGCAATGC->CAATGCG\n\n\n\n\n\nAATGCGC\n\nAATGCGC\n\n\n\nCAATGCG->AATGCGC\n\n\n\n\n\nAGCCAAA\n\nAGCCAAA\n\n\n\nGCCAAAC\n\nGCCAAAC\n\n\n\nAGCCAAA->GCCAAAC\n\n\n\n\n\nGCCAAAC->CCAAACA\n\n\n\n\n\nCATGTTG\n\nCATGTTG\n\n\n\nATGTTGT\n\nATGTTGT\n\n\n\nCATGTTG->ATGTTGT\n\n\n\n\n\nTGTTGTT\n\nTGTTGTT\n\n\n\nATGTTGT->TGTTGTT\n\n\n\n\n\nGCTACTG\n\nGCTACTG\n\n\n\nGCTACTG->CTACTGG\n\n\n\n\n\nTCGAGAG\n\nTCGAGAG\n\n\n\nTCGAGAG->CGAGAGA\n\n\n\n\n\nGAGAGAA\n\nGAGAGAA\n\n\n\nAGAGAGA->GAGAGAA\n\n\n\n\n\nCATGAGC->ATGAGCA\n\n\n\n\n\nGTGTTAG\n\nGTGTTAG\n\n\n\nTGTTAGC\n\nTGTTAGC\n\n\n\nGTGTTAG->TGTTAGC\n\n\n\n\n\nTGTTAGC->GTTAGCA\n\n\n\n\n\nCAATGTC\n\nCAATGTC\n\n\n\nAATGTCG\n\nAATGTCG\n\n\n\nCAATGTC->AATGTCG\n\n\n\n\n\nATGTCGA\n\nATGTCGA\n\n\n\nAATGTCG->ATGTCGA\n\n\n\n\n\nGAGATTT->AGATTTG\n\n\n\n\n\nATGGTAT->TGGTATC\n\n\n\n\n\nGAGAAGG\n\nGAGAAGG\n\n\n\nAGAAGGG\n\nAGAAGGG\n\n\n\nGAGAAGG->AGAAGGG\n\n\n\n\n\nGAAGGGG\n\nGAAGGGG\n\n\n\nAGAAGGG->GAAGGGG\n\n\n\n\n\nTGTCGAG\n\nTGTCGAG\n\n\n\nATGTCGA->TGTCGAG\n\n\n\n\n\nTGCCGCC\n\nTGCCGCC\n\n\n\nGTGCCGC->TGCCGCC\n\n\n\n\n\nCTGTCGG\n\nCTGTCGG\n\n\n\nCTGTCGG->TGTCGGA\n\
n\n\n\n\nAGCAGTT\n\nAGCAGTT\n\n\n\nTAGCAGT->AGCAGTT\n\n\n\n\n\nGCAGTTT\n\nGCAGTTT\n\n\n\nAGCAGTT->GCAGTTT\n\n\n\n\n\nCAACTTA\n\nCAACTTA\n\n\n\nAACTTAT\n\nAACTTAT\n\n\n\nCAACTTA->AACTTAT\n\n\n\n\n\nAACTTAT->ACTTATG\n\n\n\n\n\nCTGGTGC\n\nCTGGTGC\n\n\n\nACTGGTG->CTGGTGC\n\n\n\n\n\nCTGGTGC->TGGTGCC\n\n\n\n\n\nCGGAGGC->GGAGGCG\n\n\n\n\n\nCACTCGC\n\nCACTCGC\n\n\n\nACTCGCT\n\nACTCGCT\n\n\n\nCACTCGC->ACTCGCT\n\n\n\n\n\nCTCGCTG\n\nCTCGCTG\n\n\n\nACTCGCT->CTCGCTG\n\n\n\n\n\nCTCTTTC\n\nCTCTTTC\n\n\n\nCTCTTTC->TCTTTCA\n\n\n\n\n\nTTCATGC\n\nTTCATGC\n\n\n\nTTCATGC->TCATGCA\n\n\n\n\n\nATATCGC\n\nATATCGC\n\n\n\nCATATCG->ATATCGC\n\n\n\n\n\nTCTCGTG\n\nTCTCGTG\n\n\n\nCTCGTGC\n\nCTCGTGC\n\n\n\nTCTCGTG->CTCGTGC\n\n\n\n\n\nCTCGTGC->TCGTGCC\n\n\n\n\n\nTCTTCAT\n\nTCTTCAT\n\n\n\nTTCTTCA->TCTTCAT\n\n\n\n\n\nCTTCATG\n\nCTTCATG\n\n\n\nTCTTCAT->CTTCATG\n\n\n\n\n\nAGAATTA->GAATTAC\n\n\n\n\n\nTGTTGTT->GTTGTTT\n\n\n\n\n\nTTCTCGT\n\nTTCTCGT\n\n\n\nTTTCTCG->TTCTCGT\n\n\n\n\n\nCAGTTTC\n\nCAGTTTC\n\n\n\nAGTTTCT\n\nAGTTTCT\n\n\n\nCAGTTTC->AGTTTCT\n\n\n\n\n\nAGTTTCT->GTTTCTT\n\n\n\n\n\nCAAATAG\n\nCAAATAG\n\n\n\nCCAAATA->CAAATAG\n\n\n\n\n\nAGAGAAG\n\nAGAGAAG\n\n\n\nGAGAGAA->AGAGAAG\n\n\n\n\n\nAGAGAAG->GAGAAGG\n\n\n\n\n\nTGCATCT\n\nTGCATCT\n\n\n\nGCATCTC\n\nGCATCTC\n\n\n\nTGCATCT->GCATCTC\n\n\n\n\n\nCATCTCT\n\nCATCTCT\n\n\n\nGCATCTC->CATCTCT\n\n\n\n\n\nGAGCCAA\n\nGAGCCAA\n\n\n\nGGAGCCA->GAGCCAA\n\n\n\n\n\nGAGCCAA->AGCCAAA\n\n\n\n\n\nAAGGGGT\n\nAAGGGGT\n\n\n\nGAAGGGG->AAGGGGT\n\n\n\n\n\nAAGGGGT->AGGGGTT\n\n\n\n\n\nTTTGTGC->TTGTGCT\n\n\n\n\n\nCTGTCAT\n\nCTGTCAT\n\n\n\nGCTGTCA->CTGTCAT\n\n\n\n\n\nACTCTTT\n\nACTCTTT\n\n\n\nGACTCTT->ACTCTTT\n\n\n\n\n\nTTTGTGT\n\nTTTGTGT\n\n\n\nTTGTGTT\n\nTTGTGTT\n\n\n\nTTTGTGT->TTGTGTT\n\n\n\n\n\nTGTGTTA\n\nTGTGTTA\n\n\n\nTTGTGTT->TGTGTTA\n\n\n\n\n\nGCCGCCC\n\nGCCGCCC\n\n\n\nTGCCGCC->GCCGCCC\n\n\n\n\n\nGCCGCCC->CCGCCCT\n\n\n\n\n\nCCTAAGA\n\nCCTAAGA\n\n\n\nCCTAAGA->CTAAGAG\n\n\n\n\n\nGTCGAGA\n\nGTCGAGA\n\n\n\nTGTCGAG->GTCGAGA\n\n\n\n\n\nACTCTTT->CTCTTTC\n\n\n\n\n\nACAACTT->CAAC
TTA\n\n\n\n\n\nAAAAAAG\n\nAAAAAAG\n\n\n\nAAAAAGT\n\nAAAAAGT\n\n\n\nAAAAAAG->AAAAAGT\n\n\n\n\n\nAAAAGTG\n\nAAAAGTG\n\n\n\nAAAAAGT->AAAAGTG\n\n\n\n\n\nTATGGTG\n\nTATGGTG\n\n\n\nTATGGTG->ATGGTGC\n\n\n\n\n\nAACCGAC->ACCGACT\n\n\n\n\n\nATGCGCA\n\nATGCGCA\n\n\n\nAATGCGC->ATGCGCA\n\n\n\n\n\nACTGTCG->CTGTCGG\n\n\n\n\n\nATCTCTT\n\nATCTCTT\n\n\n\nTCTCTTT\n\nTCTCTTT\n\n\n\nATCTCTT->TCTCTTT\n\n\n\n\n\nCTCTTTA\n\nCTCTTTA\n\n\n\nTCTCTTT->CTCTTTA\n\n\n\n\n\nACCTAAG\n\nACCTAAG\n\n\n\nAACCTAA->ACCTAAG\n\n\n\n\n\nCCGCTAT\n\nCCGCTAT\n\n\n\nTCCGCTA->CCGCTAT\n\n\n\n\n\nAAGTGGG\n\nAAGTGGG\n\n\n\nAAGTGGG->AGTGGGA\n\n\n\n\n\nTTTTGTG->TTTGTGT\n\n\n\n\n\nGCAGTTT->CAGTTTC\n\n\n\n\n\nCGCCAAT\n\nCGCCAAT\n\n\n\nGCCAATG\n\nGCCAATG\n\n\n\nCGCCAAT->GCCAATG\n\n\n\n\n\nCCAATGT\n\nCCAATGT\n\n\n\nGCCAATG->CCAATGT\n\n\n\n\n\nCATCTCT->ATCTCTT\n\n\n\n\n\nCTGTCAT->TGTCATG\n\n\n\n\n\nGGATAAC\n\nGGATAAC\n\n\n\nGGATAAC->GATAACA\n\n\n\n\n\nCGATGCA->GATGCAA\n\n\n\n\n\nCCAATGT->CAATGTC\n\n\n\n\n\nCGCTACT->GCTACTG\n\n\n\n\n\nACATGTT->CATGTTG\n\n\n\n\n\nCTCGCTG->TCGCTGT\n\n\n\n\n\nCAAATAG->AAATAGC\n\n\n\n\n\nACCTAAG->CCTAAGA\n\n\n\n\n\nGCAATGT\n\nGCAATGT\n\n\n\nCAATGTT\n\nCAATGTT\n\n\n\nGCAATGT->CAATGTT\n\n\n\n\n\nAAAAAAA\n\nAAAAAAA\n\n\n\nAAAAAAA->AAAAAAG\n\n\n\n\n\nATTACAG->TTACAGG\n\n\n\n\n\nCTCTTTA->TCTTTAC\n\n\n\n\n\nAGGATAA->GGATAAC\n\n\n\n\n\nGAGCAAA->AGCAAAA\n\n\n\n\n\nATGCATC->TGCATCT\n\n\n\n\n\nATGCGCA->TGCGCAG\n\n\n\n\n\nATATCGC->TATCGCT\n\n\n\n\n\nCCGCCAA\n\nCCGCCAA\n\n\n\nCCGCCAA->CGCCAAT\n\n\n\n\n\nCCCGCCA->CCGCCAA\n\n\n\n\n\nGTATGGT\n\nGTATGGT\n\n\n\nAGTATGG->GTATGGT\n\n\n\n\n\nTTCTCGT->TCTCGTG\n\n\n\n\n\nCAAAAAA->AAAAAAA\n\n\n\n\n\nGGTGCAC->GTGCACA\n\n\n\n\n\nTCGACAT->CGACATA\n\n\n\n\n\nGTCGAGA->TCGAGAG\n\n\n\n\n\nCCGCTAT->CGCTATC\n\n\n\n\n\nTGTGTTA->GTGTTAG\n\n\n\n\n\nTGCAATG->GCAATGT\n\n\n\n\n\nCTTCATG->TTCATGC\n\n\n\n\n\nACACTCG->CACTCGC\n\n\n\n\n\nGTATGGT->TATGGTG\n\n\n\n\n\nAAAGTGG\n\nAAAGTGG\n\n\n\nAAAAGTG->AAAGTGG\n\n\n\n\n\nAAAGTGG->AAGTGGG\n\n\n\n\n", + "text/plain": [ + "" + ] + }, + 
def assemble_sequence(nodes, edges, start):
    """Reconstruct a sequence by walking an Eulerian path through a de Bruijn graph.

    The walk is a stack-based Hierholzer traversal: keep following unused
    out-edges from the node on top of the stack; when the top node has none
    left, pop it onto the finished trail. Reversing the trail gives the path
    from ``start`` that uses every reachable edge exactly once.

    Parameters
    ----------
    nodes : iterable
        Graph nodes. Accepted for interface compatibility; the traversal only
        needs ``edges`` and ``start``.
    edges : iterable of (source, target)
        Directed edges; each edge is indexed as ``edge[0]`` -> ``edge[1]``.
        Parallel (duplicate) edges are allowed and are each walked once.
        The iterable is NOT modified, so the same graph can be assembled
        again (e.g. when a notebook cell is re-run).
    start : str
        Node at which the path begins.

    Returns
    -------
    str
        ``start`` followed by the last character of every subsequent node on
        the path. With no edges leaving ``start`` this is just ``start``.

    Notes
    -----
    If the graph admits more than one Eulerian path, out-edges of a node are
    tried in the order they appear in ``edges``.
    """
    # Build source -> [targets] once, instead of rescanning the whole edge
    # list for every step of the walk.
    successors = {}
    for edge in edges:
        successors.setdefault(edge[0], []).append(edge[1])
    # Reverse each bucket so that the O(1) list.pop() hands targets back in
    # their original input order.
    for targets in successors.values():
        targets.reverse()

    stack = [start]
    trail = []  # nodes in the order they are finished, i.e. reverse path order
    while stack:
        unused = successors.get(stack[-1])
        if unused:
            # Top node still has an unused out-edge: consume it and go deeper.
            stack.append(unused.pop())
        else:
            # Dead end: this node's position on the path is now fixed.
            trail.append(stack.pop())

    trail.reverse()
    # trail[0] is `start`, so start[:-1] plus its last character restores the
    # whole start node; every later node adds exactly one new character.
    return start[:-1] + ''.join(node[-1] for node in trail)
organisms. Let's start small—with a variant of the SARS-CoV-2 virus. You're given reads from actual genome sequencing runs in the SARS-CoV2.fastq file provided.\n", + "\n", + "Repeat Question 1.3 on this data. You can re-use your implementation and simply run it on the new data. Print out your reconstructed sequence to a file \"output.txt\". For this part, we will still assume that all the reads are error-free. Set $k=25$." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "ab873973", + "metadata": {}, + "outputs": [], + "source": [ + "# Read sequence reads\n", + "sequence_reads_covid, qualities_covid = utils.read_fastq('SARS-CoV2.fastq')" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "9f579e5c", + "metadata": {}, + "outputs": [], + "source": [ + "# Read reads into graph\n", + "nodes_covid, edges_covid, start_covid = build_graph(sequence_reads_covid)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "47d72cc1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"AGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAACCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGAACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCCCAAACAAGCTAG
TCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAGAAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCTATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTGGATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAA
GGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAG
AGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGT
TGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACC
CATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCTTTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAATTTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", + "11800\n" + ] + } + ], + "source": [ + "# Call main assembly algorithm\n", + "assembled_covid_seq = assemble_sequence(nodes_covid, edges_covid, start_covid)\n", + 
"print(assembled_covid_seq)\n", + "print(len(assembled_covid_seq))" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "a6d0a928", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Write assembled sequence to file\n", + "\n", + "# assembled_seq = \"\" # Use your assembled genome\n", + "with open(\"covid_overlap.txt\", \"w\") as f:\n", + " f.write(assembled_covid_seq)" + ] + }, + { + "cell_type": "markdown", + "id": "5dbc3916", + "metadata": {}, + "source": [ + "# Question 3 - Error-Aware Assembly (Extra Credit)" + ] + }, + { + "cell_type": "markdown", + "id": "228fe448", + "metadata": {}, + "source": [ + "In the parts above, we assumed error-free reads while assembling $k$-mers. As much as we'd like that, actual reads can (and do) have errors, captured by their Phred scores. For this question, you're given raw, actual reads from sequencing runs (download reads here: https://sra-pub-sars-cov2.s3.amazonaws.com/sra-src/SRR11528307/ABS2-LN-R1_cleaned_paired.fastq.gz). Given these reads and their Phred33 scores, can you assemble the genome?\n", + "\n", + "Print out your assembled sequence, along with a brief explanation of how your algorithm works.\n", + "\n", + "This is an open-ended question. You are free to use any approach to deal with the issue. Make sure you provide your code, along with any assumptions you may have, in the cells below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbea667c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c16adb2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "vscode": { + "interpreter": { + "hash": "cda33f93a65b640015ae405d746ae85c987d5fba64b8857172b45a1c3f5e891a" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/project1.ipynb b/project1.ipynb deleted file mode 100644 index 8b9ec909..00000000 --- a/project1.ipynb +++ /dev/null @@ -1,469 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Project 1: Assembling Genes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "
Due: Monday, September 5, 8:59pm.
\n", - "
\n", - " \n", - "
\n", - "
\n", - " Collaboration and Resource Policy\n", - "
\n", - " \n", - "For this assignment, you are encouraged to work with one other person satisfying the constraints from Class 2. \n", - "You are permitted (actually _encouraged_) to discuss these problems with anyone you want, including other students in the class. If you do discuss the specific questions in the assignment with anyone other than your assignment partner and the course staff, though, you should list them in the _External resources used_ section below.\n", - " \n", - "You are welcome to use any resources you want for this assignment, other than ones that would defeat the purpose of the assignment. This means you should not look at answers or code from previous semesters of this course, or from any other students in the class (other than your collaboration with your partner), and if you find code that implements the problem you are being asked to do for the assignment, you should not use that code. You should document all external resources you use that are not part of the course materials in the _External resources used_ section below.\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Team submitting this assignment:** \n", - "
\n", - " list each member of your team here, including both your name and UVA computing id\n", - "\n", - "Team Members (Names): \n", - "\n", - "Team Member UVA Computing IDs:\n", - "\n", - "
\n", - "\n", - "**External resources used:** \n", - "
\n", - "It is not necessary to list the course materials, but if you used any other resources, including discussing problems with students not on your team, list them here.\n", - " \n", - "External Resources Used:\n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this project, we will explore genome assembly—the process of determining the order of nucleotides in DNA from fragmented reads. As you might have studied in the reading assignments, genome assembly can get quite complicated, as problems such as full sequence coverage, finding a good length for reads (the $k$ in $k$-mer), and sequencing errors present challenges for sequencing analysis and accuracy. You can assume perfect coverage for all parts of the assignment and no read errors for the first two questions.\n", - "\n", - "\n", - "Submission: Please submit the code you wrote to generate your answers for all parts using this form: https://forms.gle/rNTXfYojTLEQ8idg6. Your answers should be in the Jupyter Notebook, along with your code. Before submission, you should make a copy of your notebook file with the name uvaid1\\_uvaid2.ipynb (where uvaidn is each teammates UVA id) so the submitted file identifies you. You and your partner should submit a single file once together. Submission is due 8:59 pm (EST) on Monday, September 5." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Install basic required packages." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Install basic required packages, should be run only once. 
You may need to restart the kernel after this stage.\n", - "- Make sure you have [graphviz](https://graphviz.org/download/) installed on your system.\n", - "- The second cell adds Graphviz to your path; you may have to change it based on where the install folder is.\n", - "\n", - "NOTE: We provide utils.py, which may contain helpful functions for you to use, as well as gvmagic.py, which is a deprecated package to use graphviz within the notebook" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"PATH\"] += os.pathsep + 'C:/Program Files/Graphviz/bin'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Genome Assembly\n", - "\n", - "For this part, you're given reads generated while trying to sequence the DNA of a TeleTubby (some unknown organism) with a _very_ small genetic code. By answering the following questions, you will learn how to assemble the original genome sequence from sequence reads.\n", - "\n", - "Sequencing data is often stored in FASTQ file format. In TeleTubby.fastq, you will find the data organized in a particular order that repeats every four lines. The first line contains the metadata that encodes the name of the read, the experiment type, the kind of sequencing machine used, etc. The second line is the sequence of bases. The third line functions as a placeholder line. The fourth line is a sequence of base qualities that encode the qualities for the corresponding bases in the sequence line. We will only work with the sequence and quality score lines in this question."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import collections\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import utils\n", - "from tqdm import tqdm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.1.1 GC-content\n", - "\n", - "The GC-content (or the ratio of G and C nucleotides) is related to the melting temperature of the DNA double helix. Use the following equation to calculate the melting temperature of DNA for TeleTubby $t_m$ in Celsius:\n", - "\n", - "\\begin{equation*}\n", - "t_m = 64.9+0.41(\\%GC)-\\frac{500}{\\text{length of sequence}}\n", - "\\end{equation*}\n", - "\n", - "As a reference, the human genome is known to have between 35%-60% GC-content. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Read sequence reads (error-free) from file\n", - "sequence_reads, qualities = utils.read_fastq('TeleTubby.fastq')\n", - "\n", - "# Calculate %GC content\n", - "# Print out temperature in Celsius" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.1.2 Interpreting quality scores" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Phred33 quality scores are represented as the character with an ASCII code equal to its value + 33 (to make them easy to print alongside genome sequences). List the top 5 most frequent scores as ASCII symbols as well as their Phred33 scores in TeleTubby.fastq. You can refer to the [official Illumina website](https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/QualityScoreEncoding_swBS.htm) to reference the scoring encoding.\n", - "\n", - "What is the average Phred33 score in TeleTubby.fastq?"
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Calculate and print average Phred33 score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.1.3 Frequency analysis\n", - "\n", - "Looking at repetitions in the sequence can be helpful in estimating the \"redundancy\" in the organisms. Humans and other evolved animals have a lot of redundancy, while smaller organisms like bacteria have highly packed genomes. One heuristic to estimate this before actually performing the assembly could be looking at how often certain $k$-mers are repeated.\n", - "\n", - "Print out the 3 most frequent k-mers with their frequencies." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Find and print out the three most repeated k-mers and their frequencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Question 1.2. Greedy approach" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One of the approaches to assemble the genome from the given reads is a greedy algorithm. Have a look at the greedy algorithm described on [Wikipedia](https://en.wikipedia.org/wiki/Sequence_assembly#Greedy_algorithm) and answer the following." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.2.1 What would the runtime be of this algorithm, given $n$ $k$-mer reads?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Answer:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.2.2 Would this algorithm always yield a unique solution?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Answer:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Question 1.2.3 Would this algorithm always yield the right solution?"
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Answer:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Question 1.3 Graph-based approaches" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Graphs for genome assembly can be constructed in two ways:\n", - "\n", - "- de Bruijn graph: Processing $k$-mers as edges, with $(k-1)$-mers as nodes, and\n", - "- Overlap graph: Processing $k$-mers as nodes, with $(k-1)$-mer overlaps as edges.\n", - "\n", - "de Bruijn graphs can be processed to find Euler paths, while Overlap graphs can be processed to find Hamiltonian paths. Both of these are valid ways to reconstruct the original genome.\n", - "\n", - "Use one of these two techniques to reconstruct the sequence, and print out your reconstructed sequence. Which method did you pick out of the two, and why? (hint: imagine what would happen when we have millions of reads). Use the k-mers provided in TeleTubby.fastq.\n", - "\n", - "We provide some skeleton code that you may use, but you may also come up with your own solution."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Read reads into graph\n", - "\n", - "def build_graph(k_mers):\n", - " edges = []\n", - " nodes = set()\n", - " # Your code here\n", - " return nodes, edges\n", - "\n", - "nodes, edges = build_graph(sequence_reads)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Implement assembly algorithm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Print assembled sequence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Needed to properly visualize graphs\n", - "%load_ext gvmagic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Helper function to visualize de Bruijn graphs\n", - "%dotstr utils.viz_debruijn(nodes_bruijn, edges_bruijn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Main assembly algorithm\n", - "\n", - "def assemble_sequence(nodes, edges):\n", - " assembled_sequence = \"\"\n", - " # Your code here\n", - " return assembled_sequence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Output assembled sequence\n", - "\n", - "assmebled_seq = assemble_sequence(nodes, edges)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Question 2 - Sequencing SARS-CoV-2 virus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's move on from TeleTubbies to real-world organisms. Let's start small- with a variant of the SARS-CoV-2 virus. You're given reads from actual genome sequencing runs in the SARS-CoV2.fastq file provided.\n", - "\n", - "Repeat Question 1.3 on this data. 
You can re-use your implementation and simply run it on the new data. Print out your reconstructed sequence to a file \"output.txt\". For this part, we will still assume that all the reads are error-free. Set $k=25$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Read sequence reads\n", - "sequence_reads_covid, qualities_covid = utils.read_fastq('SARS-CoV2.fastq')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Read reads into graph\n", - "nodes_covid, edges_covid = build_graph(sequence_reads_covid)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Call main assembly algorithm\n", - "assmebled_covid_seq = assemble_sequence(nodes_covid, edges_covid)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write assembled sequence to file\n", - "\n", - "assmebled_seq = \"\" # Use your assembled genome\n", - "with open(\"covid_overlap.txt\", \"w\") as f:\n", - " f.write(assmebled_seq)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Question 3- Error-Aware Assembly (Extra Credit)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the parts above, we assumed error-free reads while assembling $k$-mers. As much as we'd like that, actual reads can (and do) have errors, captured by their Phred scores. For this question, you're given raw, actual reads from sequencing runs (download reads here: https://sra-pub-sars-cov2.s3.amazonaws.com/sra-src/SRR11528307/ABS2-LN-R1_cleaned_paired.fastq.gz). Given these reads and their Phred33 scores, can you assemble the genome?\n", - "\n", - "Print out your assembled sequence, along with a brief explanation of how your algorithm works\n", - "\n", - "This is an open-ended question. 
You are free to use any approach to deal with the issue. Make sure you provide your code, along with any assumptions you may have, in the cells below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}