"""This is an annotated version of functions that we wrote in
class to parse supp2data.csv and find the Pearson correlation
between two arbitrary genes."""

# We will use my versions of the stats functions from day 1
#   (available in stats1.py on the website)
from stats1 import mean, stdev, pearson

# ======================================================================
# Here is a single-function version based on Tali's homework solution:
# ======================================================================

def pair_correlation(filename, line1, line2):
    """Return the Pearson correlation between the genes on
    line1 and line2 of filename

    where:
       line1 and line2 are integer offsets, from 0
    and
       filename is a string, giving the name of the file to open.
    """

    # Buffer the whole file as a list of lines
    f = open(filename).readlines()

    # Sanity check -- make sure line1 and line2 are valid offsets
    #   into the file
    assert((line1 < len(f)) and (line2 < len(f)))

    # Break each line into comma-separated elements.
    words1 = f[line1].split(",")
    words2 = f[line2].split(",")
    #   The first two elements are name and annotation, so we
    #   throw them out (in class, we combined this with the
    #   previous step)
    ratios1 = words1[2:]
    ratios2 = words2[2:]

    #   Now we can check that both lines are the same length
    #     (we forgot to do this in class)
    assert(len(ratios1) == len(ratios2))

    # Initialize lists to hold the parsed log ratios from each line
    d1 = []
    d2 = []
    # Fill the lists up with parsed values.  We use zip to walk over
    #   two lists in parallel:
    for (i,j) in zip (ratios1, ratios2):
        #  We try to covert each element to a floating-point number
        try:
            d1.append(float(i))
        #  If the element is "" (due to missing data), this will fail,
        #     raising an exception
        except:
            #  We handle the missing data case by setting it to 0.0
            #    (what else could we do here?)
            d1.append(0.0)

        #  We handle the second line in the same way
        try:
            d2.append(float(j))
        except:
            d2.append(0.0)

    # Now we are ready to calculate the requested correlation
    return pearson(d1,d2)

# ======================================================================
# Here is the same protocol, refactored into three functions as
#   we did in class
# ======================================================================

def coerce_log_ratio(x):
    """Convert a string to a floating point value.
    Return 0.0 for missing or unparsable data.
    """
    try:
        return float(x)
    except:
        return 0.0

def parse_line(line):
    """Return a list of floating point log2 ratios corresponding
    to a line of text from a file formatted like supp2data.csv.
    """
    # Break the line into comma-separated elements, toss
    #    out the first two (name and annotation), and coerce
    #    the rest to floats
    return [coerce_log_ratio(i) for i in line.split(",")[2:]]

def pair_correlation2(filename, line1, line2):
    """Return the Pearson correlation between the genes on
    line1 and line2 of filename

    where:
       line1 and line2 are integer offsets, from 0
    and
       filename is a string, giving the name of the file to open.
    """

    # Buffer the whole file as a list of lines
    f = open(filename).readlines()

    # Sanity check -- make sure line1 and line2 are valid offsets
    #   into the file
    assert((line1 < len(f)) and (line2 < len(f)))

    # Parse the lines
    d = [parse_line(f[i]) for i in (line1,line2)] 

    #   Now we can check that both lines are the same length
    #     (we forgot to do this in class)
    assert(len(d[0]) == len(d[1]))

    # Now we are ready to calculate the requested correlation
    return pearson(d[0],d[1])

# ======================================================================
# Let's use csv.reader instead of split(",") to get a correct parse
#   of the file.  Note that we don't need to change coerce_log_ratio.
# ======================================================================

# skipping "2" to avoid confusion
def parse_line3(line):
    """Return a list of floating point log2 ratios corresponding
    to a list of text fields from a file formatted like supp2data.csv.
    """
    # Toss out the first two (name and annotation), and coerce the
    #   rest to floats
    return [coerce_log_ratio(i) for i in line[2:]]

def pair_correlation3(filename, line1, line2):
    """Return the Pearson correlation between the genes on
    line1 and line2 of filename

    where:
       line1 and line2 are integer offsets, from 0
    and
       filename is a string, giving the name of the file to open.
    """

    # Buffer the whole file as a list of lines
    from csv import reader
    f = [i for i in reader(open(filename))]

    # Sanity check -- make sure line1 and line2 are valid offsets
    #   into the file
    assert((line1 < len(f)) and (line2 < len(f)))

    # Parse the lines
    d = [parse_line3(f[i]) for i in (line1,line2)] 

    #   Now we can check that both lines are the same length
    #     (we forgot to do this in class)
    assert(len(d[0]) == len(d[1]))

    # Now we are ready to calculate the requested correlation
    return pearson(d[0],d[1])

# ======================================================================
#   As a final variation, let's see if we can avoid buffering the
#     whole file.  We'll also give the user a little more control
#     over how to handle missing values.
# ======================================================================

def coerce_log_ratio4(x, missing = None):
    """Convert a string to a floating point value.
    Return missing (default = None) for missing or unparsable data.
    """
    try:
        return float(x)
    except:
        return missing

def parse_line4(line, missing = None):
    """Return a list of floating point log2 ratios corresponding
    to a list of text fields from a file formatted like supp2data.csv.
    Use missing as the value for missing data (default = None).
    """
    # Toss out the first two (name and annotation), and coerce the
    #   rest to floats
    return [coerce_log_ratio4(i, missing) for i in line[2:]]

def pair_correlation4(filename, line1, line2, missing = None):
    """Return the Pearson correlation between the genes on
    line1 and line2 of filename

    where:
       line1 and line2 are integer offsets, from 0
       filename is a string, giving the name of the file to open.
       missing is a floating point number to use for missing
          values (e.g., 0.0).  If missing is None (default),
          missing data will be ignored and only columns with
          data for both genes will be used to calculate the
          correlation.
    """

    # Make place-holders for the two lines
    d = [None,None]

    # Scan the whole file, parsing the target lines as we
    #   come to them.  (enumerate is a generator similar to
    #   zip -- it keeps a running count of the number of
    #   elements that have been generated)
    from csv import reader
    for (line_number, line) in enumerate(reader(open(filename))):
        for (i, offset) in enumerate((line1, line2)):
            if(offset == line_number):
                d[i] = parse_line4(line, missing)
        # Do we have everything we need?
        if(None not in d):
            break

    # Sanity check -- make sure line1 and line2 were valid offsets
    #   into the file
    assert(None not in d)

    #   Now we can check that both lines are the same length
    #     (we forgot to do this in class)
    assert(len(d[0]) == len(d[1]))

    # Since we are allowing missing values, we need to deal
    #   with them here -- remove any position that doesn't
    #   have data in both vectors
    culled = [[],[]]
    for pair in zip(d[0],d[1]):
        if(None not in pair):
            culled[0].append(pair[0])
            culled[1].append(pair[1])

    if(len(culled[0]) < 1):
        raise ValueError, "No data for pearson calculation"

    # Now we are ready to calculate the requested correlation
    return pearson(culled[0],culled[1])

# If we are being run as a script (rather than imported)
# then run some test cases to check that our functions work
if(__name__ == "__main__"):
    filename = "supp2data.csv"
    line1 = 116
    line2 = 229

    # For lines that are not corrupted by our original parsing,
    #   the first three implementations should give the same result
    #   (we'll check the first 9 decimal places)
    for f in (pair_correlation,
              pair_correlation2,
              pair_correlation3):
        r = f(filename, line1, line2)
        assert(abs(r - 0.1668160531) < 1e-10)
        print r

    # The fourth implementation should match the others when
    #    set to match their missing data behavior
    r = pair_correlation4(filename, line1, line2, missing = 0.0)
    assert(abs(r - 0.1668160531) < 1e-10)
    print r

    # And it may give a different value when rejecting missing data
    r = pair_correlation4(filename, line1, line2)
    assert(abs(r - 0.1751798316) < 1e-10)
    print r
