pibase | pibase_to_rdf | example pibase_to_rdf.sh


The Linux bash shell script below assumes that some pibase files have already been created from the example download data in a preceding pibase_test.sh run. It runs pibase_fisherdiff for three remaining pairs of files, and then invokes pibase_to_rdf, which creates the rdf file.

################################################################
#### STEP 1 
#### Create list files "trio.txt" and "trio_ref.txt" for pibase_to_rdf
#### (These files are already included in the example data download)

cat trio.txt
dummy	ILUNA12878
output/diff_cov5_gen_NA12878_NA12891_ILLUMINA.txt	NA12891
output/diff_cov5_gen_NA12878_NA12892_ILLUMINA.txt	NA12892
output/diff_cov5_gen_NA12878_ILLUMINA_SOLID.txt	SOLNA12878
output/diff_cov5_gen_NA12878_ILLUMINA_FLX.txt	FLXNA12878

cat trio_ref.txt
dummy	HG19REF
output/diff_cov5_gen_ref_NA12878_ILLUMINA.txt	NA12878
output/diff_cov5_gen_ref_NA12891_ILLUMINA.txt	NA12891
output/diff_cov5_gen_ref_NA12892_ILLUMINA.txt	NA12892
output/diff_cov5_gen_ref_NA12878_SOLID.txt	SOLNA12878
output/diff_cov5_gen_ref_NA12878_FLX.txt	FLXNA12878
################################################################


#!/bin/bash
#### STEP 2
#### Build the pibase_fisherdiff result files for the Network analysis
#### (5 HapMap runs of 3 individuals)

# pibase_fisherdiff compares two files per call (normally a control/case
# pair, e.g. normal/tumor tissue). Every call below uses the options
# min coverage >= 5, p.median <= 0.05, factor <= 10.
daughter=output/gen_NA12878_ILLUMINA.txt

# daughter vs. her father
pibase_fisherdiff "$daughter" output/gen_NA12891_ILLUMINA.txt \
  output/diff_cov5_gen_NA12878_NA12891_ILLUMINA.txt 5 0.05 10

# daughter vs. her mother
pibase_fisherdiff "$daughter" output/gen_NA12892_ILLUMINA.txt \
  output/diff_cov5_gen_NA12878_NA12892_ILLUMINA.txt 5 0.05 10

# daughter (Illumina run) vs. her SOLiD run
pibase_fisherdiff "$daughter" output/gen_NA12878_SOLID.txt \
  output/diff_cov5_gen_NA12878_ILLUMINA_SOLID.txt 5 0.05 10

# daughter (Illumina run) vs. her FLX run
pibase_fisherdiff "$daughter" output/gen_NA12878_FLX.txt \
  output/diff_cov5_gen_NA12878_ILLUMINA_FLX.txt 5 0.05 10

#### STEP 3
#### Create output/trio.p20n.rdf for illustrative purposes:
#### p <= 0.2, no both-stranded confirmation
sample_list=trio.txt

pibase_to_rdf "$sample_list" output/trio.p20n.rdf 0.2 n

# Same settings plus the trailing "y": version with eliminated
# "N"-columns and invariable columns
pibase_to_rdf "$sample_list" output/trio.p20n.elim.rdf 0.2 n y

#### STEP 4
#### Start Network 4.6.0.0
#### Calculate Network / Network Calculations / Median Joining
#### File / Open : trio.p20n.rdf / Calculate network. 

################################################################

### trio.p20n.rdf IS THE DATA FILE USED FOR THE FIGURE AND THE 
### MANUSCRIPT FOR DEMONSTRATION PURPOSES (AS THE SOLID'S COVERAGE WAS THIN).
### NETWORK 4.6.0.0 REPLACES 'N' (SEE BELOW) WITH 1/0 FROM THE
### CLOSEST SEQUENCE.

### FOR NETWORKS FROM DATA FROM THE SAME PLATFORM, WE RECOMMEND USING
### THE OPTION TO ELIMINATE Ns, AND TRYING REFERENCE SAMPLE ALTERNATIVES
### (E.G. SEE BELOW)

### rdf file with "N"s and with invariable columns ("characters"):

cat output/trio.p20n.rdf
  ;1.0
1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35;36;37;38;39;40;41;42;43;44;45;46;47;48;49;50;51;52;53;54;
10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;
>ILUNA12878;1;;;;;;;
111111111111111111111111111111111111111111111111111111
>NA12891;1;;;;;;;
111110101111011111111111101011111111110111100111011111
>NA12892;1;;;;;;;
111111011111111010010011101111111011011101111110100011
>SOLNA12878;1;;;;;;;
1N1NNNNN1N1NNN11N1111111NNN1NNN110N111N10NN1N1NN1111NN
>FLXNA12878;1;;;;;;;
111111111111111111101111111111111111111101111111N11111

### rdf file where "N"s and invariable columns ("characters") were eliminated:

cat output/trio.p20n.elim.rdf
  ;1.0
18;20;21;22;28;34;37;41;44;50;51;52;
10;10;10;10;10;10;10;10;10;10;10;10;
>ILUNA12878;1;;;;;;;
111111111111
>NA12891;1;;;;;;;
111101110111
>NA12892;1;;;;;;;
010010001000
>SOLNA12878;1;;;;;;;
111110101111
>FLXNA12878;1;;;;;;;
101011101111





#####################################################################

#### ALTERNATIVE STEP 3:
#### Use a "clean" reference sample generated by pibase_rdf_ref

# Build the clean reference sample from the NA12878 Illumina file:
ref_sample=output/gen_refsample.txt
pibase_rdf_ref output/gen_NA12878_ILLUMINA.txt "$ref_sample" 100

# Compare each of the 5 samples pair-wise against the clean reference sample.
# One pibase_fisherdiff call per run, all with the same options (5 0.05 10);
# results go to output/diff_cov5_gen_ref_<run>.txt, as listed in trio_ref.txt.
for run in NA12878_ILLUMINA NA12891_ILLUMINA NA12892_ILLUMINA NA12878_SOLID NA12878_FLX; do
  pibase_fisherdiff "$ref_sample" "output/gen_${run}.txt" \
    "output/diff_cov5_gen_ref_${run}.txt" 5 0.05 10
done

ref_list=trio_ref.txt

# rdf file using p<=0.2 and no both-stranded validation to detect differences:
pibase_to_rdf "$ref_list" output/trio_ref.p20n.rdf 0.2 n

# Same settings, additionally eliminating Ns and invariable characters:
pibase_to_rdf "$ref_list" output/trio_ref.p20n.elim.rdf 0.2 n y

################################################################

### rdf file with "N"s and with invariable columns ("characters"):

cat output/trio_ref.p20n.rdf
  ;1.0
1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35;36;37;38;39;40;41;42;43;44;45;46;47;48;49;50;51;52;53;54;
10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;
>HG19REF;1;;;;;;;
111111111111111111111111111111111111111111111111111111
>NA12878;1;;;;;;;
111110001110011011001011101011101011010101111111000110
>NA12891;1;;;;;;;
111111011110011011001011111011101011010101100111100110
>NA12892;1;;;;;;;
111110001100011110000111101011101111010111111110011011
>SOLNA12878;1;;;;;;;
1N1NNNNN1N1NNN10N1011111NNN0NNN011N101N11NN1N1NN1001NN
>FLXNA12878;1;;;;;;;
111110101110011011001011101011101011010111111111N00110

### rdf file where "N"s and invariable columns ("characters") were eliminated:

cat output/trio_ref.p20n.elim.rdf
  ;1.0
11;16;18;19;20;21;22;28;32;34;37;41;44;50;51;52;
10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;
>HG19REF;1;;;;;;;
1111111111111111
>NA12878;1;;;;;;;
1010010000001001
>NA12891;1;;;;;;;
1010010000000001
>NA12892;1;;;;;;;
0100001001011110
>SOLNA12878;1;;;;;;;
1010111001011001
>FLXNA12878;1;;;;;;;
1010010000011001


^top