Working with the SAT data

1. Cleaning the data

,,2012 SAT Results,,,
DBN,SCHOOL NAME,# of SAT Test Takers,SAT Critical Reading Avg.,SAT Math Avg.,SAT Writing Avg.
02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECONDARY SCHOOL,16,395,400,387
06M540,A. PHILIP RANDOLPH CAMPUS HIGH SCHOOL,228,430,456,423
21K410,ABRAHAM LINCOLN HIGH SCHOOL,475,396,437,393
21K412/21K411,ABRAHAM LINCOLN YABC/LEARNING TO WORK GED AT ABRAHAM LINCOLN,s,s,s,s
30Q301,ACADEMY FOR CAREERS IN TELEVISION AND FILM,98,410,440,405
17K382,ACADEMY FOR COLLEGE PREPARATION AND CAREER EXPLORATION: A COLLEGE BOARD SCHOOL,59,396,374,369
18K637,ACADEMY FOR CONSERVATION AND THE ENVIRONMENT,35,363,381,367
32K403,ACADEMY FOR ENVIRONMENTAL LEADERSHIP,50,380,366,352
17K751,ACADEMY FOR HEALTH CAREERS,s,s,s,s
09X365,ACADEMY FOR LANGUAGE AND TECHNOLOGY,54,315,339,297
11X270,ACADEMY FOR SCHOLARSHIP AND ENTREPRENEURSHIP: A COLLEGE BOARD SCHOOL,56,386,394,361
05M367,ACADEMY FOR SOCIAL ACTION: A COLLEGE BOARD SCHOOL,33,354,366,342
14K404,ACADEMY FOR YOUNG WRITERS,68,387,357,381
30Q575,ACADEMY OF AMERICAN STUDIES,135,487,492,491
13K336,ACADEMY OF BUSINESS AND COMMUNITY DEVELOPMENT,9,439,374,418
04M635,ACADEMY OF ENVIRONMENTAL SCIENCE SECONDARY HIGH SCHOOL,48,369,369,357
24Q264,ACADEMY OF FINANCE AND ENTERPRISE,89,405,454,421
17K408,ACADEMY OF HOSPITALITY AND TOURISM,57,352,342,351
19K618,ACADEMY OF INNOVATIVE TECHNOLOGY,60,371,371,354
27Q309,ACADEMY OF MEDICAL TECHNOLOGY: A COLLEGE BOARD SCHOOL,36,367,379,348
32K552,ACADEMY OF URBAN PLANNING,67,342,364,354
13K499,ACORN COMMUNITY HIGH SCHOOL,72,384,364,368
07X600,ALFRED E. SMITH CAREER AND TECHNICAL EDUCATION HIGH SCHOOL,76,394,400,364
32K554,ALL CITY LEADERSHIP SECONDARY SCHOOL,39,428,465,422
08X376,"ANTONIA PANTOJA PREPARATORY ACADEMY, A COLLEGE BOARD SCHOOL",s,s,s,s
08X367,"ARCHIMEDES ACADEMY FOR MATH, SCIENCE AND TECHNOLOGY APPLICATIONS",s,s,s,s
02M630,ART AND DESIGN HIGH SCHOOL,270,444,441,430
...
09X505,"BRONX SCHOOL FOR LAW, GOVERNMENT AND JUSTICE",80,404,418,402
 
def CleanSATData(infile, outfile):
    '''Read and clean the SAT data, then write it out'''
    try:
        fin = open(infile,'r')
        raw_string = fin.read()
        raw_lines = raw_string.split('\n')
        fin.close()
    except:
        print 'ERROR: cannot read '+infile
        return
    
    clean_lines = []   # the cleaned data lines
    for raw_line in raw_lines:
        raw_line = raw_line.strip()  # in case there are trailing '\r' chars
        fields = raw_line.split(',')
        
        # Reject if there aren't enough fields
        if len(fields) < 6: continue
        
        # Reject if any one of the last 4 fields is not integers 
        allFieldsNumeric = True
        for i in [-4,-3,-2,-1]:
            if not fields[i].isdigit():
                allFieldsNumeric = False
        if not allFieldsNumeric: continue
        
        # now deal with the school names inside double-quotes:
        if '"' in raw_line:
            pos_1 = raw_line.find('"')  # find the positions of the quotes
            pos_2 = raw_line.find('"',pos_1+1)
            name = raw_line[pos_1+1:pos_2]  # extract the school name
            name = name.replace(',',';')    # change the school name
            raw_line = raw_line[:pos_1] + name + raw_line[pos_2+1:] # replace name, and removing quotes
            fields = raw_line.split(',')  # re-split the line into fields

        # create the clean line
        clean_line = ','.join(fields)
        clean_lines.append(clean_line)

    # write it out
    try:
        fout = open(outfile,'w')
        fout.write('\n'.join(clean_lines))
        fout.close()
    except:
        print 'ERROR: cannot write to '+outfile

def Create_dSchools(cleanFile):
    try:
        fin = open(cleanFile,'r')
        lines = fin.read().split('\n')
        fin.close()
    except:
        print 'ERROR: cannot read '+cleanFile
        return

    dSchools={}
    for line in lines:
        fields = line.split(',')
        
        # calculate total SAT score
        totalSAT = 0
        for i in [-3,-2,-1]:
            totalSAT += int(fields[i])

        dSchools[fields[1]] = totalSAT

    return dSchools
def DictInvert(indict):
    ''' Invert the given dictionary and return it'''
    outdict={}
    for key in indict:
        value = indict[key]
        if value in outdict:
            outdict[value].append(key)
        else:
            outdict[value] = [key]
    return outdict
def PrintSATSchools(cleanFile):
    dSchools = Create_dSchools(cleanFile)
    dSAT = DictInvert(dSchools)
    keys = dSAT.keys()
    keys.sort()
    print 'Highest SAT: ',keys[-1],dSAT[keys[-1]]
    median = len(keys)/2
    print 'Median SAT: ',keys[median],dSAT[keys[median]]
    print 'Lowest SAT: ',keys[0],dSAT[keys[0]]