635 lines
		
	
	
	
		
			5.6 KiB
		
	
	
	
		
			Markdown
		
	
	
	
	
	
			
		
		
	
	
			635 lines
		
	
	
	
		
			5.6 KiB
		
	
	
	
		
			Markdown
		
	
	
	
	
	
| ---
 | |
| author: einar
 | |
| comments: true
 | |
| date: 2006-10-23 20:32:58+00:00
 | |
| layout: post
 | |
| slug: text-files-with-python
 | |
| title: Text files with Python
 | |
| wordpress_id: 123
 | |
| categories:
 | |
| - General
 | |
| - Linux
 | |
| - Science
 | |
| ---
 | |
| 
 | |
| Finally I cleaned up my code enough to post it here. It's probably still ugly, but not as ugly as when I wrote it down the first time. It's all about manipulating text files, to be precise tab-delimited files. All the snippets are published under the [GNU GPL](http://www.gnu.org/licenses/gpl.html) v2 (not that I think that anyone would use them, but just in case...).
 | |
| <!-- more --> I started with one file, listing [Entrez Gene](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene) identifiers for a number of genes coming out of a statistical analysis, along with their [SAM](http://www-stat.stanford.edu/~tibs/SAM/) scores to see if they were differentially expressed or not. I needed to add information such as gene name and symbol, [Gene Ontology](http://www.geneontology.org), and [Uniprot/SwissProt](http://www.expasy.uniprot.org/) IDs, leaving out the statistical paramters. I tried at first to use the EUtils package of [Bioptyhon](http://biopython.org), but due to both my lack of skill and the total lack of documentation, I dropped the idea and moved to a different plan.
 | |
| 
 | |
| First, I used [DAVID](http://niaid.abcc.ncifcrf.gov/) to obtain all the annotation data I needed. There are some columns that are redundant, so I decided to remove them as well. Once I had the original files (with the SAM data) and the DAVID results, I could start:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   1. 
 | |
| 
 | |
| 
 | |
| import sys
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   2. 
 | |
| 
 | |
| 
 | |
| import csv
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   3. 
 | |
| 
 | |
| 
 | |
| import re
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   4. 
 | |
| 
 | |
| 
 | |
| import tempfile
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| I used csv to easily handle comma-delimited files, tempfile to handle temporary files securely, sys to get command line arguments and re to do some regular expression matching and substituting. Basically, I had a series of functions that first of all obtained the SAM data and encoded them (1, up-regulation; 0, not differentially expressed; -1 down-regulation), creating a dictionary:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   1. 
 | |
| 
 | |
| 
 | |
| def getSAMFlag(file):
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   2. 
 | |
| 
 | |
| 
 | |
| sam_dict={}
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   3. 
 | |
| 
 | |
| 
 | |
| for row in file:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   4. 
 | |
| 
 | |
| 
 | |
| if row[0] == "locuslink":
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   5. 
 | |
| 
 | |
| 
 | |
| continue
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   6. 
 | |
| 
 | |
| 
 | |
| if row[22] != "NA":
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   7. 
 | |
| 
 | |
| 
 | |
| if float(row[22]) > 0:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   8. 
 | |
| 
 | |
| 
 | |
| row[22] = "1"
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   9. 
 | |
| 
 | |
| 
 | |
| elif float(row[22]) < 0:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   10. 
 | |
| 
 | |
| 
 | |
| row[22] = "-1"
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   11. 
 | |
| 
 | |
| 
 | |
| else:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   12. 
 | |
| 
 | |
| 
 | |
| row[22] = "0"
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   13. 
 | |
| 
 | |
| 
 | |
| sam_dict[row[0]]=row[22]
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   14. 
 | |
| 
 | |
| 
 | |
| return sam_dict
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| I had also to prepare the file coming out of DAVID, stripping the useless fields. As csv.reader gives an iterator  that returns a row for each cycle, it turned out to be quite easy:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   1. 
 | |
| 
 | |
| 
 | |
| def displayColumns(file,dest,cards=0):
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   2. 
 | |
| 
 | |
| 
 | |
| file_csv = csv.reader(file,dialect="ncbi")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   3. 
 | |
| 
 | |
| 
 | |
| dest_csv = csv.writer(dest,dialect="ncbi")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   4. 
 | |
| 
 | |
| 
 | |
| for row in file_csv:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   5. 
 | |
| 
 | |
| 
 | |
| if cards == 1:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   6. 
 | |
| 
 | |
| 
 | |
| if row[4] =="GENE_SYMBOL":
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   7. 
 | |
| 
 | |
| 
 | |
| row = row[0:2] + row[4:8] + row[3:4]
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   8. 
 | |
| 
 | |
| 
 | |
| dest_csv.writerow(row)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   9. 
 | |
| 
 | |
| 
 | |
| continue
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   10. 
 | |
| 
 | |
| 
 | |
| geneCardsURL = "< URL removed >"
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   11. 
 | |
| 
 | |
| 
 | |
| preURL = "< a xhref=\""
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   12. 
 | |
| 
 | |
| 
 | |
| postURL = "\">"
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   13. 
 | |
| 
 | |
| 
 | |
| endURL = "< /a>"
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   14. 
 | |
| 
 | |
| 
 | |
| row[4] = re.sub(", ","",row[4]) # Togliamo la virgola e lo spazio da fine colonna
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   15. 
 | |
| 
 | |
| 
 | |
| row[4] = preURL + geneCardsURL + row[4] + postURL + row[4] + endURL
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   16. 
 | |
| 
 | |
| 
 | |
| if row[3] == "":
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   17. 
 | |
| 
 | |
| 
 | |
| row[3] = "N/A"
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   18. 
 | |
| 
 | |
| 
 | |
| row = row[0:2] + row[4:8] + row[3:4]
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   19. 
 | |
| 
 | |
| 
 | |
| dest_csv.writerow(row)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| The optional "cards" parameter creates a HTML to link to the [GeneCards](http://www.genecards.org) database in order to query gene symbols. I removed the URL just for formatting purposes (and for some reason "a href" becomes "a xhref"), but it's easy to fetch it by querying by gene symbol. This code creates a new table with Entrez Gene ID, Gene Name, Gene Symbol, chromosome and cytoband and also the GO Cellular Component level 3 (adding "N/A" if there is no annotation). The re.sub is used to remove a comma followed by a space that is present at the end of the Gene Symbol annotation.Once I had all of this, I wrote a function to write the SAM results into this new table:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   1. 
 | |
| 
 | |
| 
 | |
| def writeSAM(file,data_file,dest):
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   2. 
 | |
| 
 | |
| 
 | |
| file_csv = csv.reader(file,dialect="ncbi")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   3. 
 | |
| 
 | |
| 
 | |
| dest_csv = csv.writer(dest,dialect="ncbi")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   4. 
 | |
| 
 | |
| 
 | |
| data_file_csv = csv.reader(data_file,dialect="ncbi")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   5. 
 | |
| 
 | |
| 
 | |
| sam_dict = getSAMFlag(data_file_csv)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   6. 
 | |
| 
 | |
| 
 | |
| sam_keys = sam_dict.keys()
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   7. 
 | |
| 
 | |
| 
 | |
| for row in file_csv:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   8. 
 | |
| 
 | |
| 
 | |
| if row[0] == "ENTREZ_GENE_ID":
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   9. 
 | |
| 
 | |
| 
 | |
| row.append("Flag SAM")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   10. 
 | |
| 
 | |
| 
 | |
| if row[0] in sam_keys:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   11. 
 | |
| 
 | |
| 
 | |
| row.append(sam_dict[row[0]])
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   12. 
 | |
| 
 | |
| 
 | |
| dest_csv.writerow(row)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| The if for ENTREZ_GENE_ID is used to add a header ("Flag SAM") to the columns. There's nothing much to say about the actual program, if not pointing out the very easy creation of temporary files:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   1. 
 | |
| 
 | |
| 
 | |
| temp = tempfile.NamedTemporaryFile()
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| And last but not least, the class definition of the dialect "ncbi" that I used to parse the text:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   1. 
 | |
| 
 | |
| 
 | |
| class ncbi:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   2. 
 | |
| 
 | |
| 
 | |
| delimiter = '\t'
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   3. 
 | |
| 
 | |
| 
 | |
| quotechar = '"'
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   4. 
 | |
| 
 | |
| 
 | |
| escapechar = None
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   5. 
 | |
| 
 | |
| 
 | |
| doublequote = True
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   6. 
 | |
| 
 | |
| 
 | |
| skipinitialspace = False
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   7. 
 | |
| 
 | |
| 
 | |
| lineterminator = '\n'
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   8. 
 | |
| 
 | |
| 
 | |
| quoting = csv.QUOTE_NONE
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| This is invoked using the csv.register_dialect method after instantiating:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   1. 
 | |
| 
 | |
| 
 | |
| ncbi = ncbi()
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|   2. 
 | |
| 
 | |
| 
 | |
| dial = csv.register_dialect("ncbi", ncbi)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| Even though my programming style is probably bad, I have to notice that the code I presented is not in the order it appears in the script (obviously). In any case, if there are any suggestions to improve, let me know.
 |