# Mount Google Drive at /content/drive so files stored on Drive can be read
# through ordinary filesystem paths from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# CSV with the learning-session records, stored on the mounted Drive.
file_path = '/content/drive/MyDrive/Wukong Data Test/adjusted_learning_data_with_summary_contents.csv'

# Read the whole file into the working DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(5)

# Parse session_date into pandas datetimes so date arithmetic works.
df['session_date'] = pd.to_datetime(df['session_date'])

# Order every user's sessions chronologically. session_index is the tie-breaker:
# a user can have several sessions on the same date, and without it the order of
# those rows (and so which row is "followed by" the next day's session) would
# depend on the row order of the CSV.
df = df.sort_values(['user_id', 'session_date', 'session_index'])

# Date of the same user's following session (next row within the user group);
# NaT on each user's last session.
df['next_session_date'] = df.groupby('user_id')['session_date'].shift(-1)

# Whole days between this session and the following one (0 for same-day sessions).
df['days_until_next'] = (df['next_session_date'] - df['session_date']).dt.days

# Next-day return: the user has at least one session on the calendar day after
# this one. Looking only at the immediately following row (days_until_next == 1)
# misses a real next-day return whenever the user also had another session later
# on the same day, because that row's gap is 0.
session_day = df['session_date'].dt.normalize()
active_days = pd.MultiIndex.from_arrays([df['user_id'], session_day])
following_days = pd.MultiIndex.from_arrays(
    [df['user_id'], session_day + pd.Timedelta(days=1)]
)
# Rows with a missing date can never count as retained.
df['next_day_retained'] = (
    following_days.isin(active_days) & session_day.notna().to_numpy()
)

# Keep only the first 5 sessions, used to analyse retention during the free stage.
df_retention = df[df['session_index'] <= 5].copy()

# Example: per-user maxima of session_index and paid, plus a flag for users
# whose highest session_index reached at least 5.
user_stats = df.groupby("user_id")[['session_index', 'paid']].max()
user_stats['high_retention'] = user_stats['session_index'].ge(5)

# For every session_index: number of rows ('count') and number of rows flagged
# as next-day retained ('sum').
retained_by_index = df_retention.groupby('session_index')['next_day_retained']
retention_summary = retained_by_index.agg(['count', 'sum'])

# Retention rate = retained rows / all rows, rounded to 3 decimals.
retention_ratio = retention_summary['sum'].div(retention_summary['count'])
retention_summary['retention_rate'] = retention_ratio.round(3)

# Turn session_index back into a regular column for plotting.
retention_plot = retention_summary.reset_index()

# Line chart of the next-day retention rate for each session index.
fig, ax = plt.subplots(figsize=(8, 5))

ax.plot(
    retention_plot['session_index'],
    retention_plot['retention_rate'],
    marker='o',
    linewidth=2,
    color='orange',
)

# Title and axis labels (English).
ax.set_title('Next-Day Retention Rate by Session Index', fontsize=14)
ax.set_xlabel('Session Index (Learning Times)', fontsize=12)
ax.set_ylabel('Next-Day Retention Rate', fontsize=12)

# One tick per session index; rates are shown on a fixed 0-1 scale.
ax.set_xticks(retention_plot['session_index'])
ax.set_ylim(0, 1)

ax.grid(True)
fig.tight_layout()

plt.show()

# Rows where the session ended with an exit (exit_flag equals True).
exit_df = df.loc[df['exit_flag'].eq(True)].copy()

# Number of exit rows per content item.
exit_count = exit_df.groupby('content_id').size().rename('exit_count')

# Exit rows per content item that are not flagged as next-day retained.
not_retained = exit_df['next_day_retained'].eq(False)
exit_lost_count = (
    exit_df.loc[not_retained]
    .groupby('content_id')
    .size()
    .rename('exit_and_churn_count')
)

# Put both counts side by side; content with no churned exits gets 0.
exit_analysis = pd.concat([exit_count, exit_lost_count], axis=1).fillna(0)

# Share of exits that were not followed by a next-day return.
churn_ratio = exit_analysis['exit_and_churn_count'].div(exit_analysis['exit_count'])
exit_analysis['churn_rate_after_exit'] = churn_ratio.round(3)

# content_id becomes a regular column again for display and plotting.
exit_analysis = exit_analysis.reset_index()

exit_analysis

# Highest churn-after-exit first, so the worst content items stand out.
exit_analysis_sorted = exit_analysis.sort_values(by='churn_rate_after_exit', ascending=False)

# Horizontal bar chart: one bar per content item.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    exit_analysis_sorted['content_id'],
    exit_analysis_sorted['churn_rate_after_exit'],
    color='tomato',
)

ax.set_title('Next-Day Churn Rate After Exit by Content', fontsize=14)
ax.set_xlabel('Churn Rate After Exit', fontsize=12)
ax.set_ylabel('Content ID', fontsize=12)

# Dashed vertical guide lines only.
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()

# Number of rows per content item, whether or not the session ended in an exit.
total_appearance = df['content_id'].value_counts().rename('total_count')

# Attach the totals by looking each content_id up in the counts. Unlike
# merge(on='content_id'), this does not depend on how value_counts() names its
# index (which differs between pandas versions), and re-running the cell simply
# overwrites the column instead of producing total_count_x / total_count_y.
exit_analysis['total_count'] = exit_analysis['content_id'].map(total_appearance)

# Exit rate = exit rows / all rows for that content item.
exit_analysis['exit_rate'] = (exit_analysis['exit_count'] / exit_analysis['total_count']).round(3)

# Highest exit rate first.
exit_rate_sorted = exit_analysis.sort_values(by='exit_rate', ascending=False)

# Horizontal bar chart of the exit rate per content item.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    exit_rate_sorted['content_id'],
    exit_rate_sorted['exit_rate'],
    color='steelblue',
)

ax.set_title('Exit Rate by Content', fontsize=14)
ax.set_xlabel('Exit Rate', fontsize=12)
ax.set_ylabel('Content ID', fontsize=12)

# Dashed vertical guide lines only.
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()

# Bucket correct_rate into 0.1-wide intervals from 0.5 to 1.0.
# include_lowest=True makes the first interval include 0.5 itself.
# NOTE: values outside [0.5, 1.0] get no bucket (NaN) and drop out of the groupby.
df_exit_rate = df.copy()
df_exit_rate['correct_bin'] = pd.cut(df_exit_rate['correct_rate'],
                                     bins=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                                     include_lowest=True)

# Mean of exit_flag (the exit rate) per correct-rate interval.
# observed=False is passed explicitly: it keeps every interval in the result,
# even an empty one, and avoids the FutureWarning about the changing default.
exit_by_correct = (
    df_exit_rate.groupby('correct_bin', observed=False)['exit_flag']
    .mean()
    .round(3)
    .reset_index()
    .rename(columns={'exit_flag': 'exit_rate'})
)

# Show the result table.
exit_by_correct

/tmp/ipython-input-57-490851674.py:8: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  exit_by_correct = df_exit_rate.groupby('correct_bin')['exit_flag'].mean().round(3).reset_index()

# Bar chart: correct-rate interval vs exit rate.
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(
    exit_by_correct['correct_bin'].astype(str),
    exit_by_correct['exit_rate'],
    color='royalblue',
)

ax.set_title('Exit Rate by Correct Rate Interval', fontsize=14)
ax.set_xlabel('Correct Rate Interval', fontsize=12)
ax.set_ylabel('Exit Rate', fontsize=12)
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Print each bar's value just above its top edge, centred on the bar.
for rect in bars:
    value = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2
    ax.text(x_center, value + 0.02, f'{value:.2f}',
            ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()

# Bucket correct_rate into intervals (0.5-0.6, 0.6-0.7, ..., 0.9-1.0).
# include_lowest=True makes the first interval include 0.5 itself.
# NOTE: values outside [0.5, 1.0] get no bucket (NaN) and drop out of the groupby.
df_retention_rate = df.copy()
df_retention_rate['correct_bin'] = pd.cut(df_retention_rate['correct_rate'],
                    bins=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                    include_lowest=True)

# Mean of next_day_retained per correct-rate interval.
# observed=False is passed explicitly: it keeps every interval in the result,
# even an empty one, and avoids the FutureWarning about the changing default.
correct_retention = (
    df_retention_rate.groupby('correct_bin', observed=False)['next_day_retained']
    .mean()
    .round(3)
    .reset_index()
    .rename(columns={'next_day_retained': 'avg_next_day_retention'})
)

# Show the result.
correct_retention

/tmp/ipython-input-59-135271496.py:8: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  correct_retention = df_retention_rate.groupby('correct_bin')['next_day_retained'].mean().round(3).reset_index()

# Bar chart: correct-rate interval vs next-day retention rate.
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(
    correct_retention['correct_bin'].astype(str),
    correct_retention['avg_next_day_retention'],
    color='mediumseagreen',
)

ax.set_title('Next-Day Retention Rate by Correct Rate', fontsize=14)
ax.set_xlabel('Correct Rate Interval', fontsize=12)
ax.set_ylabel('Next-Day Retention Rate', fontsize=12)
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Print each bar's value just above its top edge, centred on the bar.
for rect in bars:
    value = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2
    ax.text(x_center, value + 0.02, f'{value:.2f}',
            ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()

# One row per user: whether the user ever paid (max of the paid flag).
user_paid = df.groupby('user_id')['paid'].max().reset_index().rename(columns={'paid': 'final_paid'})

# One row per user: mean correct_rate across that user's sessions.
user_correct = df.groupby('user_id')['correct_rate'].mean().reset_index().rename(columns={'correct_rate': 'avg_correct_rate'})

# Join the two per-user tables.
correct_vs_paid = pd.merge(user_correct, user_paid, on='user_id')

# Bucket the per-user average into 0.1-wide intervals from 0.5 to 1.0.
# NOTE: averages outside [0.5, 1.0] get no bucket (NaN) and drop out of the groupby.
correct_vs_paid['correct_bin'] = pd.cut(correct_vs_paid['avg_correct_rate'],
                                        bins=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                                        include_lowest=True)

# Share of paying users per interval.
# observed=False is passed explicitly: it keeps every interval in the result,
# even an empty one, and avoids the FutureWarning about the changing default.
paid_by_correct = (
    correct_vs_paid.groupby('correct_bin', observed=False)['final_paid']
    .mean()
    .round(3)
    .reset_index()
    .rename(columns={'final_paid': 'paid_rate'})
)

# Show the result.
paid_by_correct

/tmp/ipython-input-61-1343709252.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  paid_by_correct = correct_vs_paid.groupby('correct_bin')['final_paid'].mean().round(3).reset_index()

# Bar chart: per-user average correct-rate interval vs final paid rate.
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(
    paid_by_correct['correct_bin'].astype(str),
    paid_by_correct['paid_rate'],
    color='salmon',
)

ax.set_title('Final Paid Rate by Avg Correct Rate', fontsize=14)
ax.set_xlabel('Avg Correct Rate Interval (per user)', fontsize=12)
ax.set_ylabel('Paid Rate', fontsize=12)
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Print each bar's value just above its top edge, centred on the bar.
for rect in bars:
    value = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2
    ax.text(x_center, value + 0.02, f'{value:.2f}',
            ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()

# Per content item: mean duration, mean exit_flag (exit rate) and mean
# next_day_retained (retention rate), each rounded to 3 decimals.
content_aggregations = {
    'avg_duration': pd.NamedAgg(column='duration', aggfunc='mean'),
    'exit_rate': pd.NamedAgg(column='exit_flag', aggfunc='mean'),
    'retention_rate': pd.NamedAgg(column='next_day_retained', aggfunc='mean'),
}
content_stats = df.groupby('content_id').agg(**content_aggregations)
content_stats = content_stats.round(3).reset_index()

# Inspect the table, longest average duration first.
content_stats.sort_values(by='avg_duration', ascending=False)

# Scatter: average duration vs exit rate, one coloured point per content item.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=content_stats,
    x='avg_duration',
    y='exit_rate',
    hue='content_id',
    s=100,
    ax=ax,
)

ax.set_title('Avg Duration vs Exit Rate by Content')
ax.set_xlabel('Average Duration (minutes)')
ax.set_ylabel('Exit Rate')
ax.grid(True)
fig.tight_layout()
plt.show()

# Scatter: average duration vs next-day retention rate, one coloured point per content item.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=content_stats,
    x='avg_duration',
    y='retention_rate',
    hue='content_id',
    s=100,
    ax=ax,
)

ax.set_title('Avg Duration vs Next-Day Retention Rate by Content')
ax.set_xlabel('Average Duration (minutes)')
ax.set_ylabel('Retention Rate')
ax.grid(True)
fig.tight_layout()
plt.show()

# Distribution of the per-content exit rate.
content_stats['exit_rate'].describe()

# Distribution of the per-content average duration.
content_stats['avg_duration'].describe()

# The three content items with the highest exit rate.
content_stats.sort_values('exit_rate', ascending=False).head(3)

# Content with exit rate above 0.78 AND average duration above 8.9
# (author's hypothesis: too hard, took too long, learner does not want to continue).
high_exit = content_stats['exit_rate'].gt(0.78)
long_duration = content_stats['avg_duration'].gt(8.9)
suspicious_mask = high_exit & long_duration

suspicious = content_stats.loc[suspicious_mask]
suspicious

# Store the same condition as a flag column on the stats table.
content_stats['is_suspicious'] = suspicious_mask

# Scatter: average duration vs exit rate; colour marks the suspicious flag,
# marker shape identifies the content item.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=content_stats,
    x='avg_duration',
    y='exit_rate',
    hue='is_suspicious',
    style='content_id',
    s=100,
    ax=ax,
)

ax.set_title('Avg Duration vs Exit Rate by Content')
ax.set_xlabel('Average Duration')
ax.set_ylabel('Exit Rate')
ax.grid(True)
fig.tight_layout()
plt.show()

# Build one profile row per user.
user_summary = df.groupby("user_id").agg(
    avg_duration=("duration", "mean"),          # mean session duration
    avg_correct_rate=("correct_rate", "mean"),  # mean correct rate
    exit_rate=("exit_flag", "mean"),            # share of sessions ending in an exit
    # Number of sessions. Count session_date, not next_session_date: the latter
    # is NaT on each user's last session, so counting it is always one short
    # (a user with a single session would get 0).
    session_count=("session_date", "count"),
    paid=("paid", "max"),                       # whether the user ever paid
).reset_index()

# Show the first rows.
user_summary.head()

# Simple definition: a user with at least 3 sessions counts as high retention.
user_summary['high_retention'] = user_summary['session_count'] >= 3

# Compare high-retention and other users on two features, side by side (1 row x 2 columns).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (column to plot, panel title, y-axis label) for the left and right panel.
panels = [
    ('avg_correct_rate', 'Correct Rate by Retention', 'Average Correct Rate'),
    ('avg_duration', 'Avg Duration by Retention', 'Average Duration'),
]

for panel_ax, (column, title, y_label) in zip(axes, panels):
    sns.boxplot(data=user_summary, x='high_retention', y=column, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('High Retention')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# Feature matrix and target: predict the high_retention flag from three per-user features.
feature_columns = ['avg_duration', 'avg_correct_rate', 'exit_rate']
X = user_summary[feature_columns]
y = user_summary['high_retention']

# Hold out a test split with a fixed seed.
# NOTE(review): X_test / y_test are not used afterwards in this block.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a logistic regression with default settings on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Coefficient per feature, largest first.
# NOTE(review): the features are not scaled here, so coefficient sizes are not
# directly comparable across features — confirm before reading them as importance.
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0],
})
coefficients.sort_values(by='Coefficient', ascending=False)

	user_id	session_date	session_index	duration	content_id	correct_rate	paid	next_session_date	exit_flag
0	1	2025-06-01	1	6.4	content_3	0.78	False	2025-06-02	True
1	1	2025-06-02	2	14.8	content_1	0.79	False	2025-06-02	True
2	1	2025-06-02	3	7.1	content_4	0.62	False	2025-06-02	False
3	1	2025-06-02	4	11.9	content_8	0.72	False	NaN	True
4	2	2025-06-01	1	10.6	content_1	0.75	False	2025-06-01	False

	content_id	exit_count	exit_and_churn_count	churn_rate_after_exit
0	content_1	325	240	0.738
1	content_2	274	204	0.745
2	content_3	324	252	0.778
3	content_4	302	211	0.699
4	content_5	270	195	0.722
5	content_6	318	245	0.770
6	content_7	299	213	0.712
7	content_8	284	200	0.704
8	content_9	275	177	0.644

	correct_bin	exit_rate
0	(0.499, 0.6]	0.818
1	(0.6, 0.7]	0.805
2	(0.7, 0.8]	0.820
3	(0.8, 0.9]	0.789
4	(0.9, 1.0]	0.828

	correct_bin	avg_next_day_retention
0	(0.499, 0.6]	0.244
1	(0.6, 0.7]	0.230
2	(0.7, 0.8]	0.220
3	(0.8, 0.9]	0.195
4	(0.9, 1.0]	0.231

	correct_bin	paid_rate
0	(0.499, 0.6]	0.109
1	(0.6, 0.7]	0.174
2	(0.7, 0.8]	0.249
3	(0.8, 0.9]	0.191
4	(0.9, 1.0]	0.066

图表解读：Next-Day Retention Rate by Session Index

图表解读：Next-Day Churn Rate After Exit by Content

图表解读：不同内容的退出率与次日流失率分析

图表解读：正确率区间与退出率的关系

图表解读：正确率区间与次日留存率的关系

图表解读：用户平均正确率与最终付费率的关系

图表解读：不同内容的平均学习时长与退出率关系

图表解读：不同内容的平均学习时长与次日留存率关系

图表解读：平均停留时长与退出率（含是否异常）

图表解读：高留存用户的行为特征

特征重要性分析：逻辑回归模型结果

	content_id	avg_duration	exit_rate	retention_rate
7	content_8	9.194	0.791	0.234
3	content_4	9.064	0.774	0.233
1	content_2	9.036	0.753	0.192
2	content_3	8.876	0.913	0.203
8	content_9	8.849	0.743	0.265
6	content_7	8.834	0.793	0.228
4	content_5	8.812	0.754	0.209
5	content_6	8.679	0.893	0.205
0	content_1	8.677	0.898	0.235

	exit_rate
count	9.000000
mean	0.812444
std	0.068928
min	0.743000
25%	0.754000
50%	0.791000
75%	0.893000
max	0.913000

	user_id	avg_duration	avg_correct_rate	exit_rate	session_count	paid
0	1	10.050	0.7275	0.750000	3	False
1	2	9.600	0.8200	0.666667	2	False
2	3	4.800	0.6600	1.000000	0	False
3	4	9.675	0.7475	0.750000	3	False
4	5	8.750	0.7900	1.000000	1	False

	Feature	Coefficient
1	avg_correct_rate	0.517502
0	avg_duration	0.066054
2	exit_rate	-2.943294

特征名称	系数 (Coefficient)
avg_correct_rate	0.518
avg_duration	0.066
exit_rate	-2.943