From b7667f0331d1391200cc02a25b1a0d777b7ff833 Mon Sep 17 00:00:00 2001 From: tingwen Date: Tue, 26 Apr 2022 14:29:04 +0000 Subject: [PATCH] =?UTF-8?q?=E5=9B=9E=E9=80=80=20'Pull=20Request=20!16=20:?= =?UTF-8?q?=20=E9=99=88=E6=96=AF=E5=9D=A6=20=E4=BD=9C=E4=B8=9A'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- HW-mid(1).ipynb | 532 ---------------------------------------- HW-re-exercise(2).ipynb | 324 ------------------------ 2 files changed, 856 deletions(-) delete mode 100644 HW-mid(1).ipynb delete mode 100644 HW-re-exercise(2).ipynb diff --git a/HW-mid(1).ipynb b/HW-mid(1).ipynb deleted file mode 100644 index 2cc3764..0000000 --- a/HW-mid(1).ipynb +++ /dev/null @@ -1,532 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 期中作业" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- 数据:2020年美国债券交易市场数据,截取了前5000条\n", - " - ..\\lecture-python\\raw-data\\bond_intraday_trade.csv\n", - "- 使用的变量(variables)\n", - " - `'cusip_id'`: the unique ID of the bond\n", - " - `'trd_exctn_dt'`: the date of the bond trade\n", - " - `'trd_exctn_tm'`: the time of the bond trade\n", - " - `'rptd_pr'`: reported price\n", - "- 目标:计算 Daily Roll’s Measure (衡量债券流动性的指标)\n", - "- **问题**:\n", - " 1. 请指出下列计算Roll's measure code 中的一个明显的bug (重复出现)\n", - " 2. 提出一个优化coding的思路并进行修改" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Daily Roll’s Measure\n", - "\n", - "1. delete all records of which the cusip_id is missing\n", - "2. sort the data by cusip_id, trade date\n", - "3. Within each trade date, for each bond, calculate return \n", - " $$R_t = \\frac{P_t - P_{t-1}}{P_{t-1}}$$\n", - "4. within each trade date, for each bond, the roll’s measure is\n", - " $$\\text{Roll's measure} = 2\\sqrt{-cov(R_t,R_{t-1})}$$\n", - "\n", - " - if daily cov > 0, there are 4 ways to deal with it:\n", - " - roll's measure = na\n", - " - roll's measure = 0\n", - " - take the square root without applying the negative sign and treat the result as a negative spread.\n", - " - treat positive covariances as if they are negative, resulting in a positive Roll spread estimate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate Daily Roll’s Measure" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### import data and quick check" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv('traceclean.csv') #请自己修改路径" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# data.head()\n", - "# using columns:\n", - " # cusip_id\n", - " # trd_exctn_dt\n", - " # trd_exctn_tm\n", - " # rptd_pr\n", - "data_use = data[['cusip_id','trd_exctn_dt','trd_exctn_tm','rptd_pr']]\n", - "data_use.columns=['id','date','hour','price']" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatehourprice
000037BAB82020031311:14:41100.351
100037BAB82020033113:04:45101.019
200037BAB82020040710:07:24101.285
300037BAB82020020316:40:25102.660
400037BAB82020020316:40:25102.910
\n", - "
" - ], - "text/plain": [ - " id date hour price\n", - "0 00037BAB8 20200313 11:14:41 100.351\n", - "1 00037BAB8 20200331 13:04:45 101.019\n", - "2 00037BAB8 20200407 10:07:24 101.285\n", - "3 00037BAB8 20200203 16:40:25 102.660\n", - "4 00037BAB8 20200203 16:40:25 102.910" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_use.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 10000 entries, 0 to 9999\n", - "Data columns (total 4 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 10000 non-null object \n", - " 1 date 10000 non-null int64 \n", - " 2 hour 10000 non-null object \n", - " 3 price 10000 non-null float64\n", - "dtypes: float64(1), int64(1), object(2)\n", - "memory usage: 312.6+ KB\n", - "--------------------\n" - ] - }, - { - "data": { - "text/plain": [ - "count 10000.000000\n", - "mean 105.358831\n", - "std 9.676449\n", - "min 49.900000\n", - "25% 101.000000\n", - "50% 104.000000\n", - "75% 108.183750\n", - "max 182.013000\n", - "Name: price, dtype: float64" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_use.info()\n", - "print('-'*20)\n", - "data_use['price'].describe() # 这里使用的是部分数据,也许没有价格=0的数据\n", - "# min price = 0, check?\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id 0\n", - "date 0\n", - "hour 0\n", - "price 0\n", - "dtype: int64\n" - ] - } - ], - "source": [ - "# check price = 0\n", - "print(data_use[data_use['price']==0].count())\n", - "# okay, one row: drop this data\n", - "# data_use[data_use['price']==0]\n", - "# check quantile\n", - "# data_use.quantile(q=[0.01,0.02,0.03,0.04,0.05]) # drop more data?\n", - "\n", - "# check if there are missing value in bond code\n", - "#data_use['id'].isnull().any()\n", - "# no missing value for bond id\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# first: drop price == 0\n", - "data_use2 = data_use.drop(index=(data_use.loc[(data_use['price']==0)].index))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### calculate intra day return" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 111441\n", - "1 130445\n", - "2 100724\n", - "3 164025\n", - "4 164025\n", - " ... \n", - "9995 170050\n", - "9996 143337\n", - "9997 143337\n", - "9998 143337\n", - "9999 135249\n", - "Name: hour_adj, Length: 10000, dtype: int32" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# first: adjust date format\n", - "# join hour-minute-second,then change type to int, just for sorting\n", - "data_use2['hour_adj'] = data_use2['hour'].str.replace(':','').astype('int')\n", - "data_use2['hour_adj']" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2020-03-13\n", - "1 2020-03-31\n", - "2 2020-04-07\n", - "3 2020-02-03\n", - "4 2020-02-03\n", - " ... \n", - "9995 2020-03-18\n", - "9996 2020-05-04\n", - "9997 2020-05-04\n", - "9998 2020-05-04\n", - "9999 2020-02-06\n", - "Name: date, Length: 10000, dtype: datetime64[ns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# adjust date format\n", - "data_use2['date'] = pd.to_datetime(data_use2['date'],format='%Y%m%d')\n", - "data_use2['date']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# NOTE\n", - "# 时间相同,价格不同? 取均值处理" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Bug : 计算 rate = (df.price - df.price.shift(1))/df.price.shift(1) 改用temp而不是df" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "# count some trading info\n", - "low_trade_times = 0\n", - "positive_cov = 0\n", - "# calculate roll's measure\n", - "def roll_liquidity1(df):\n", - " # if cov> 0, return np.nan\n", - " global low_trade_times, positive_cov\n", - " # first: 时间相同,取价格mean\n", - " temp = pd.DataFrame(df.groupby('hour_adj')['price'].mean()).sort_index(ascending=True)#\n", - " # only calculate when size > 3\n", - " if temp.shape[0]>3:\n", - " rate = (temp.price - temp.price.shift(1))/temp.price.shift(1)\n", - " c = np.cov(rate[1:-1],rate[2:])[0,1] # bec 0 is na \n", - " if c < 0:\n", - " return np.sqrt(-c)*2\n", - " else:\n", - " positive_cov += 1\n", - " return np.nan\n", - " low_trade_times += 1\n", - " return [np.nan]*4\n", - "\n", - "\n", - "def roll_liquidity2(df):\n", - " # if cov> 0, return 0\n", - " #global low_trade_times, positive_cov\n", - " # first: 时间相同,取价格mean\n", - " temp = pd.DataFrame(df.groupby('hour_adj')['price'].mean()).sort_index(ascending=True)\n", - " # only calculate when size > 3\n", - " if temp.shape[0]>3:\n", - " rate = (temp.price - temp.price.shift(1))/temp.price.shift(1)\n", - " c = np.cov(rate[1:-1],rate[2:])[0,1] # bec 0 is na\n", - " if c < 0:\n", - " return np.sqrt(-c)*2\n", - " else:\n", - " #positive_cov += 1\n", - " return 0\n", - " #low_trade_times += 1\n", - " return np.nan # 日内价格数量<3, 赋值np.nan, 以区分cov<0,事后可以方便改为0若有需要\n", - "\n", - "\n", - "\n", - "def roll_liquidity3(df):\n", - " # if cov> 0, take the square root without applying the negative sign and treat the result as a negative spread.\n", - " #global low_trade_times, positive_cov\n", - " # first: 时间相同,取价格mean\n", - " temp = pd.DataFrame(df.groupby('hour_adj')['price'].mean()).sort_index(ascending=True)\n", - " # only calculate when size > 3\n", - " if temp.shape[0]>3:\n", - " rate = (temp.price - temp.price.shift(1))/temp.price.shift(1)\n", - " c = np.cov(rate[1:-1],rate[2:])[0,1] # bec 0 is na\n", - " if c < 0:\n", - " return np.sqrt(-c)*2\n", - " else:\n", - " #positive_cov += 1\n", - " return -np.sqrt(c)*2\n", - " #low_trade_times += 1\n", - " return np.nan # 日内价格数量<3, 赋值np.nan, 以区分cov<0,事后可以方便改为0若有需要\n", - "\n", - "\n", - "def roll_liquidity4(df):\n", - " # if cov> 0, treat positive covariances as if they are negative, resulting in a positive Roll spread estimate\n", - " #global low_trade_times, positive_cov\n", - " # first: 时间相同,取价格mean\n", - "\n", - " temp = pd.DataFrame(df.groupby('hour_adj')['price'].mean()).sort_index(ascending=True)\n", - " # only calculate when size > 3\n", - " if temp.shape[0]>3:\n", - " rate = (temp.price - temp.price.shift(1))/temp.price.shift(1)\n", - " c = np.cov(rate[1:-1],rate[2:])[0,1] # bec 0 is na\n", - " if c < 0:\n", - " return np.sqrt(-c)*2\n", - " else:\n", - " #positive_cov += 1\n", - " return np.sqrt(c)*2\n", - " #low_trade_times += 1\n", - " return np.nan # 日内价格数量<3, 赋值np.nan, 以区分cov<0,事后可以方便改为0若有需要\n" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(804, 115)" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "low_trade_times = 0\n", - "positive_cov = 0\n", - "roll_daily = pd.DataFrame(data_use2.groupby(['id','date']).apply(roll_liquidity1),columns = ['roll_liquidity_method1'])\n", - "low_trade_times, positive_cov" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [], - "source": [ - "roll_daily2 = pd.DataFrame(data_use2.groupby(['id','date']).apply(roll_liquidity2),columns = ['roll_liquidity_method2'])\n", - "roll_daily3 = pd.DataFrame(data_use2.groupby(['id','date']).apply(roll_liquidity3),columns = ['roll_liquidity_method3'])\n", - "roll_daily4 = pd.DataFrame(data_use2.groupby(['id','date']).apply(roll_liquidity4),columns = ['roll_liquidity_method4'])" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "result = pd.concat([roll_daily,roll_daily2,roll_daily3,roll_daily4],axis = 1)\n", - "result.reset_index(inplace=True)\n", - "result.rename(columns={'id':'cusip_id'},inplace = True)\n", - "result.to_csv('roll_daily_liquidity_measure_all.csv', index = False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/HW-re-exercise(2).ipynb b/HW-re-exercise(2).ipynb deleted file mode 100644 index eec9a81..0000000 --- a/HW-re-exercise(2).ipynb +++ /dev/null @@ -1,324 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 135, - "id": "f8612641", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import re\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "id": "f0e0b156", - "metadata": {}, - "outputs": [], - "source": [ - "cbond = pd.read_excel('cbond-interest-info.xlsx')\n", - "cbond.columns = ['code','name','interest_info']\n", - "cbond.drop(index=[0],inplace = True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "id": "a0f41e24", - "metadata": {}, - "outputs": [], - "source": [ - "list_code = list(cbond.code)\n", - "list_name = list(cbond.name)\n", - "list_info = list(cbond.interest_info)\n", - "n = 0\n", - "list_interest = []\n", - "\n", - "for i in range(0,len(list_info)):\n", - " list_info[i] = list_info[i].split('%')\n", - " for j in range(0,len(list_info[i])): \n", - " list_info[i][j] =re.sub(u\"([^\\u0030-\\u0039\\u002e\\uffe5])\", \"\", list_info[i][j])\n", - " if re.findall(r'\\b\\d+\\b', list_info[i][j]) != []:\n", - " list_info[i][j] = float(list_info[i][j])\n", - " if list_info[i][j] > 10:\n", - " list_info[i][j] = '' \n", - " else :\n", - " list_info[i][j] =str(list_info[i][j]) + '%'\n", - " else :\n", - " list_info[i][j] = ''\n", - " while '' in list_info[i]:\n", - " list_info[i].remove('')\n", - " if len(list_info[i]) >= n:\n", - " n = len(list_info[i])\n", - " \n", - "for i in range(0,n):\n", - " list_interest.append([])\n", - " \n", - "for i in range(0,len(list_info)):\n", - " ind = 0\n", - " for j in range(0,len(list_info[i])): \n", - " list_interest[ind].append(list_info[i][j])\n", - " ind += 1\n", - " if ind < n:\n", - " for a in range(ind,n):\n", - " list_interest[a].append(\"NA\")" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "id": "db77333e", - "metadata": {}, - "outputs": [], - "source": [ - "data = {'name':list_name,'code':list_code}\n", - "for i in range(0,len(list_interest)):\n", - " data['interest_year'+str(i+1)] = list_interest[i]\n", - "df = pd.DataFrame(data)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "id": "67e41189", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namecodeinterest_year1interest_year2interest_year3interest_year4interest_year5interest_year6
0无锡转债110043.SH0.3%0.5%0.8%1.0%1.3%1.8%
1广电转债110044.SH0.4%0.6%1.0%1.5%1.8%2.0%
2海澜转债110045.SH0.3%0.5%0.8%1.0%1.3%1.8%
3山鹰转债110046.SH0.5%0.8%1.0%1.5%1.8%2.0%
4福能转债110047.SH0.4%0.6%1.0%1.5%2.0%3.0%
...........................
419旺能转债128141.SZ0.3%0.5%1.0%1.5%1.8%2.0%
420新乳转债128142.SZ0.3%0.5%1.0%1.5%1.8%2.0%
421锋龙转债128143.SZ0.5%0.7%1.2%1.8%2.5%3.0%
422利民转债128144.SZ0.3%0.5%0.8%1.0%1.5%2.0%
423日丰转债128145.SZ0.3%0.6%1.0%1.5%2.0%2.5%
\n", - "

424 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " name code interest_year1 interest_year2 interest_year3 \\\n", - "0 无锡转债 110043.SH 0.3% 0.5% 0.8% \n", - "1 广电转债 110044.SH 0.4% 0.6% 1.0% \n", - "2 海澜转债 110045.SH 0.3% 0.5% 0.8% \n", - "3 山鹰转债 110046.SH 0.5% 0.8% 1.0% \n", - "4 福能转债 110047.SH 0.4% 0.6% 1.0% \n", - ".. ... ... ... ... ... \n", - "419 旺能转债 128141.SZ 0.3% 0.5% 1.0% \n", - "420 新乳转债 128142.SZ 0.3% 0.5% 1.0% \n", - "421 锋龙转债 128143.SZ 0.5% 0.7% 1.2% \n", - "422 利民转债 128144.SZ 0.3% 0.5% 0.8% \n", - "423 日丰转债 128145.SZ 0.3% 0.6% 1.0% \n", - "\n", - " interest_year4 interest_year5 interest_year6 \n", - "0 1.0% 1.3% 1.8% \n", - "1 1.5% 1.8% 2.0% \n", - "2 1.0% 1.3% 1.8% \n", - "3 1.5% 1.8% 2.0% \n", - "4 1.5% 2.0% 3.0% \n", - ".. ... ... ... \n", - "419 1.5% 1.8% 2.0% \n", - "420 1.5% 1.8% 2.0% \n", - "421 1.8% 2.5% 3.0% \n", - "422 1.0% 1.5% 2.0% \n", - "423 1.5% 2.0% 2.5% \n", - "\n", - "[424 rows x 8 columns]" - ] - }, - "execution_count": 140, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c15e8cb", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c0d086a", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- Gitee