[1]:

import pandas as pd
import numpy as np
import plotly.express as px

[2]:

file_path = "/home/huzr/pisa2015_science/pisa2015_science/cog_science.csv"
df = pd.read_csv(file_path, low_memory=False)

Data Description¶

Field	annotation
CNTRYID	The country which the student comes from
CNTSTUID	Student ID
Region	The region which the student comes from
STRATUM	Types of students classified by school and place of birth, etc.
SUBNATIO	Further division of country
ADMINMODE	The means by which the student answer questions(paper or computer)
LANGTEST_COG	The language which the student uses
BOOKID	Form ID
CBASCI	Science Cluster Combination Random Number(S)
CS601Q01S, CS601Q02S … DS626Q04C	Question ID

Record Examples¶

[3]:

df.head()

[3]:

	CNTRYID	CNTSTUID	Region	STRATUM	SUBNATIO	ADMINMODE	LANGTEST_COG	BOOKID	CBASCI	DS269Q01C	...	CS601Q01S	CS601Q02S	CS601Q04S	DS610Q01C	CS610Q02S	CS610Q04S	CS626Q01S	CS626Q02S	CS626Q03S	DS626Q04C
0	Albania	803627	Albania	ALB - stratum 05: Urban \ South \ Public	Albania	Paper	Albanian	Form 27 (PBA)	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Albania	800454	Albania	ALB - stratum 05: Urban \ South \ Public	Albania	Paper	Albanian	Form 9 (PBA)	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	Albania	800893	Albania	ALB - stratum 05: Urban \ South \ Public	Albania	Paper	Albanian	Form 18 (PBA)	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	Albania	804180	Albania	ALB - stratum 05: Urban \ South \ Public	Albania	Paper	Albanian	Form 10 (PBA)	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Albania	800491	Albania	ALB - stratum 05: Urban \ South \ Public	Albania	Paper	Albanian	Form 19 (PBA)	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 192 columns

[4]:

print("{} Students from {} regions around the world participate in this assessment.".format(len(df['CNTSTUID']), len(df['Region'].unique())))

519334 Students from 118 regions around the world participate in this assessment.

The columns begin with “DS” or “CS” are the questions.

[5]:

question_ids = [col for col in df.columns if 'DS' in col or 'CS' in col]
print("There are {} questions involved.".format(len(question_ids)))

There are 183 questions involved.

Data cleaning¶

Drop useless columns¶

[6]:

useless_columns = ["CBASCI", "BOOKID", "ADMINMODE", "SUBNATIO", "LANGTEST_COG"]
df.drop(columns=useless_columns, inplace=True)
df.head()

[6]:

	CNTRYID	CNTSTUID	Region	STRATUM	DS269Q01C	DS269Q03C	CS269Q04S	CS408Q01S	DS408Q03C	CS408Q04S	...	CS601Q01S	CS601Q02S	CS601Q04S	DS610Q01C	CS610Q02S	CS610Q04S	CS626Q01S	CS626Q02S	CS626Q03S	DS626Q04C
0	Albania	803627	Albania	ALB - stratum 05: Urban \ South \ Public	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Albania	800454	Albania	ALB - stratum 05: Urban \ South \ Public	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	Albania	800893	Albania	ALB - stratum 05: Urban \ South \ Public	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	Albania	804180	Albania	ALB - stratum 05: Urban \ South \ Public	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Albania	800491	Albania	ALB - stratum 05: Urban \ South \ Public	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 187 columns

Transform the result to score¶

[7]:

def transform_to_number(ans):
    if isinstance(ans, float):
        return np.NaN
    if 'No credit' in ans:
        return 0
    if 'Partial credit' in ans:
        return 1
    if 'Full credit' in ans:
        return 2
    if 'No Response' in ans:
        return np.NaN
    if 'Not Reached' in ans:
        return np.NaN
    if 'Not Applicable' in ans:
        return np.NaN

    return np.NaN

for q in question_ids:

    df[q] = df[q].map(transform_to_number)

df_question = df[question_ids]

[8]:

df_difficulty = df_question.transpose().mean(1) / 2
# define difficulty equals average score divided by full score(2)
df_question = df_question / df_difficulty
# The more difficult questions will bring more score

[9]:

print("5 least answered questions:")
df_question.count().nsmallest()

5 least answered questions:

[9]:

DS327Q02C        0
DS438Q03C    38225
DS425Q04C    39228
DS524Q07C    39777
DS268Q02C    39958
dtype: int64

The question “DS327Q02C” is bad, because no one has given the correct answer.

[10]:

df_question.drop(columns=['DS327Q02C'], inplace=True)
print("The number of students who answer each question:")
df_question.count()

The number of students who answer each question:

[10]:

DS269Q01C    44197
DS269Q03C    43324
CS269Q04S    48415
CS408Q01S    48440
DS408Q03C    42613
             ...
CS610Q04S    91081
CS626Q01S    92980
CS626Q02S    91846
CS626Q03S    91033
DS626Q04C    89762
Length: 182, dtype: int64

Calculate average score for each student¶

[11]:

df_student = df.copy()
df_student['Avg score'] = df_question.mean(1)
df_student['Question count'] = df_question.count(1)
df_student.drop(columns=question_ids, inplace=True)
df_student.drop(df_student[df_student['Question count'] == 0].index, inplace=True)

[12]:

print("{} students give at least one valid answer.".format(len(df_student)))

446051 students give at least one valid answer.

[13]:

df_student.head()

[13]:

	CNTRYID	CNTSTUID	Region	STRATUM	Avg score	Question count
10734	Australia	3610676	Australia	AUS - stratum 21: QLD-Gov, Y10	2.945319	19
10735	Australia	3611874	Australia	AUS - stratum 21: QLD-Gov, Y10	2.693977	27
10736	Australia	3601769	Australia	AUS - stratum 21: QLD-Gov, Y10	2.982012	23
10737	Australia	3605996	Australia	AUS - stratum 21: QLD-Gov, Y10	2.330750	32
10738	Australia	3608147	Australia	AUS - stratum 21: QLD-Gov, Y10	1.752320	32

Sort questions by average score¶

[27]:

q_top20 = df_difficulty.nlargest(20)
fig = px.bar(
    q_top20.iloc[::-1],
    labels={"value": "Difficulty", "index": "Question Name"},
    orientation='h',
    title="Top 20 easy questions"
            )
fig.update_layout(showlegend=False)
fig.show('svg')

../../_images/build_blitz_pisa2015math_23_0.svg

[14]:

q_bottom20 = df_difficulty.nsmallest(20)
fig = px.bar(
    q_bottom20.iloc[::-1],
    labels={"value": "Difficulty", "index": "Question Name"},
    orientation='h',
    title="Top 20 difficult questions"
)
fig.update_layout(showlegend=False)
fig.show('svg')

../../_images/build_blitz_pisa2015math_24_0.svg

Sort regions by average score¶

[15]:

df_region = \
    df_student[["Region", "Avg score", "Question count"]] \
        .groupby("Region").mean().rename(columns={"Avg score": "Region avg score"})

[16]:

df_region.describe()

[16]:

	Region avg score	Question count
count	103.000000	103.000000
mean	1.988160	29.004939
std	0.397446	1.280883
min	0.822475	22.413025
25%	1.782955	28.684978
50%	2.119420	29.295016
75%	2.259783	29.780795
max	2.635470	30.473813

[17]:

region_top20 = df_region["Region avg score"].nlargest(20)
fig = px.bar(
    region_top20.iloc[::-1],
    labels={"value": "Region average Score", "index": "Region"},
    orientation='h',
    title="Top 20 regions"
)
fig.update_layout(showlegend=False)
fig.show("svg")

../../_images/build_blitz_pisa2015math_28_0.svg

[18]:

region_bottom20 = df_region["Region avg score"].nsmallest(20)
fig = px.bar(
    region_bottom20.iloc[::-1],
    labels={"index": "Region", "value": "Region"},
    orientation="h",
    title="Bottom 20 regions"
)
fig.update_layout(showlegend=False)
fig.show("svg")

../../_images/build_blitz_pisa2015math_29_0.svg

The distrbution of scores¶

[19]:

df_student.describe()

[19]:

	CNTSTUID	Avg score	Question count
count	4.460510e+05	446051.000000	446051.000000
mean	4.689134e+07	1.955394	28.581337
std	3.016672e+07	1.059917	5.101046
min	3.600001e+06	0.000000	1.000000
25%	1.880574e+07	1.101785	26.000000
50%	4.400022e+07	1.816121	29.000000
75%	7.520527e+07	2.694062	33.000000
max	9.731129e+07	8.116385	55.000000

[21]:

def level(score):
    return int(score * 10)

df_student_level = df_student["Avg score"].apply(level)
df_student_level_dist = df_student_level.groupby(df_student_level).size()
fig = px.bar(
    df_student_level_dist,
    labels={"value": "Students counts", "index": "Score range"},
    title="Score distribution",
    hover_data={
        "variable":False,
    }
)
fig.update_layout(showlegend=False)
fig.show('svg')

df_student_level_dist

../../_images/build_blitz_pisa2015math_32_0.svg

[21]:

Avg score
0     2055
1     2159
2     4048
3     6144
4     8811
      ...
67       1
70       1
72       1
79       1
81       3
Name: Avg score, Length: 72, dtype: int64

Sort students by average score¶

[22]:

df_student["CNTSTUID"] = df_student["CNTSTUID"].apply(str)
s_top500 = \
    df_student[df_student["Question count"] >= 10] \
        .nlargest(500, ["Avg score"]).set_index("CNTSTUID")
# We only select students who have answered more than 10 questions
s_top500.index = s_top500.index.map(str)
fig = px.bar(
    s_top500[:20].iloc[::-1],
    x = "Avg score",
    orientation="h",
    title="The score of top 20 students"
)
fig.update_layout(yaxis_type='category')
fig.update_layout(showlegend=False)
fig.show('svg')

../../_images/build_blitz_pisa2015math_34_0.svg

[24]:

s_top500_dist = s_top500.groupby("CNTRYID").size()
s_top500_dist = s_top500_dist[s_top500_dist > 10]
fig = px.pie(
    values=s_top500_dist,
    names=s_top500_dist.index,
    title="Top 500 students come from"
)
fig.show('svg')

../../_images/build_blitz_pisa2015math_35_0.svg

[25]:

s_bottom500 = df_student[(df_student["Question count"] > 10) & (df_student["Avg score"] > 0.1)]\
    .nsmallest(500, "Avg score")
# A too low score indicates that the student may answer questions blindly
s_bottom500.set_index("CNTSTUID")
s_bottom500.index = s_bottom500.index.map(str)
fig = px.bar(
    s_bottom500.iloc[:20].iloc[::-1],
    x = "Avg score",
    orientation="h",
    title="The score of bottom 20 students"
)
fig.update_layout(yaxis_type='category')
fig.show('svg')

../../_images/build_blitz_pisa2015math_36_0.svg

[26]:

s_bottom500_dist = s_bottom500.groupby("CNTRYID").size()
s_bottom500_dist = s_bottom500_dist[s_bottom500_dist > 10]
fig = px.pie(
    names=s_bottom500_dist.index,
    values=s_bottom500_dist,
    title="Bottom 500 students come from",
)
fig.show('svg')

../../_images/build_blitz_pisa2015math_37_0.svg

[ ]: