# Dimension-table encoding: keep each distinct string once in `dim` and
# describe the observations as integer positions into it (`values`).
In [15]: values = pd.Series([0, 1, 0, 0] * 2)  # 4-element pattern repeated twice -> 8 integer codes
In [16]: dim = pd.Series(['apple', 'orange'])  # the distinct values the codes point at
In [17]: values
Out[17]:
0 0
1 1
2 0
3 0
4 0
5 1
6 0
7 0
dtype: int64
In [18]: dim
Out[18]:
0 apple
1 orange
dtype: object
In [19]: dim.take(values)  # positional lookup: expands the codes back into strings; index labels below are the looked-up positions
Out[19]:
0 apple
1 orange
0 apple
0 apple
0 apple
1 orange
0 apple
0 apple
dtype: object
# Build a small example frame whose `fruit` column holds only two distinct strings.
# NOTE(review): no RNG seed is set in the visible lines before the np.random calls
# below, so the `count`/`weight` values shown will not reproduce unless a seed was
# set earlier in the session -- confirm.
In [20]: fruits = ['apple', 'orange', 'apple', 'apple'] * 2  # 8 labels
In [21]: N = len(fruits)  # row count reused for every column below
In [22]: df = pd.DataFrame({'fruit': fruits,
....: 'basket_id': np.arange(N),  # 0..N-1
....: 'count': np.random.randint(3, 15, size=N),  # random integers from [3, 15)
....: 'weight': np.random.uniform(0, 4, size=N)},  # random floats from [0, 4)
....: columns=['basket_id', 'fruit', 'count', 'weight'])  # fixes the column order shown below
In [23]: df
Out[23]:
basket_id fruit count weight
0 0 apple 5 3.858058
1 1 orange 8 2.612708
2 2 apple 4 2.995627
3 3 apple 7 2.614279
4 4 apple 12 2.990859
5 5 orange 8 3.845227
6 6 apple 5 0.033553
7 7 apple 4 0.425778
# Convert the string column to the categorical dtype and look at its two parts:
# the distinct categories and the per-row integer codes.
In [24]: fruit_cat = df['fruit'].astype('category')  # bound to a new name; df['fruit'] is not reassigned here
In [25]: fruit_cat
Out[25]:
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]
In [26]: c = fruit_cat.values  # the object backing the categorical Series
In [27]: type(c)  # NOTE(review): the module path printed below is presumably pandas-version specific
Out[27]: pandas.core.categorical.Categorical
In [28]: c.categories  # the distinct values, held as an Index
Out[28]: Index(['apple', 'orange'], dtype='object')
In [29]: c.codes  # one int8 per row: position of that row's value in c.categories
Out[29]: array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
# Replace the DataFrame column with its categorical version.
In [30]: df['fruit'] = df['fruit'].astype('category')  # overwrites the column in df
In [31]: df.fruit  # attribute-style access to the 'fruit' column
Out[31]:
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]
# Constructing Categorical objects directly: from raw values, from codes plus
# categories, and with an explicit ordering.
In [32]: my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])  # categories inferred from the data
In [33]: my_categories  # inferred categories print as [bar, baz, foo], not in first-appearance order
Out[33]:
[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]
In [34]: categories = ['foo', 'bar', 'baz']  # plain list; this name is rebound to a Series at In [56]
In [35]: codes = [0, 1, 2, 0, 0, 1]  # each code is a position in `categories`
In [36]: my_cats_2 = pd.Categorical.from_codes(codes, categories)  # keeps the category order as given
In [37]: my_cats_2
Out[37]:
[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]
In [38]: ordered_cat = pd.Categorical.from_codes(codes, categories,
....: ordered=True)  # declares foo < bar < baz
In [39]: ordered_cat  # the `<` separators in the repr mark the ordering
Out[39]:
[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo < bar < baz]
In [40]: my_cats_2.as_ordered()  # ordered form of an unordered Categorical; result is displayed, not assigned
Out[40]:
[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo < bar < baz]
# Seed the global NumPy RNG, then draw 1000 standard-normal samples.
In [41]: np.random.seed(12345)  # fixes the random sequence from this point on
In [42]: draws = np.random.randn(1000)
In [43]: draws[:5]  # peek at the first five; NOTE(review): 4-decimal display presumably comes from print options set outside this excerpt
Out[43]: array([-0.2047, 0.4789, -0.5194, -0.5557, 1.9658])
# Memory and conversion-cost comparison between an object-dtype string Series and
# its categorical equivalent.
# NOTE(review): prompts jump from In [43] to In [53]; the intervening cells are not
# part of this excerpt.
In [53]: N = 10000000  # rebinds N (previously len(fruits)) to ten million
In [54]: draws = pd.Series(np.random.randn(N))  # rebinds `draws`; not referenced again in the visible lines
In [55]: labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))  # 4 labels repeated N // 4 times -> N rows
In [56]: categories = labels.astype('category')  # rebinds `categories` (was a plain list at In [34])
# NOTE(review): memory_usage() is called without deep=True. The 80000080 figure
# below equals 8 bytes per row plus 80, so it presumably counts only the object
# pointers and not the string payloads -- consider memory_usage(deep=True) for a
# like-for-like comparison.
In [57]: labels.memory_usage()
Out[57]: 80000080
In [58]: categories.memory_usage()
Out[58]: 10000272
In [59]: %time _ = labels.astype('category')  # one-off cost of the conversion; result discarded into `_`
CPU times: user 490 ms, sys: 240 ms, total: 730 ms
Wall time: 726 ms
# Small categorical Series used by the category-method examples that follow.
In [60]: s = pd.Series(['a', 'b', 'c', 'd'] * 2)  # 8 rows, 4 distinct values
In [61]: cat_s = s.astype('category')
In [62]: cat_s
Out[62]:
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (4, object): [a, b, c, d]
# Widen the category set beyond the values actually observed and see how that
# shows up in value_counts.
# NOTE(review): prompts jump from In [62] to In [65]; the intervening cells are not
# part of this excerpt.
In [65]: actual_categories = ['a', 'b', 'c', 'd', 'e']  # 'e' never occurs in the data
In [66]: cat_s2 = cat_s.cat.set_categories(actual_categories)  # result bound to a new name; cat_s still reports 4 categories at In [68]
In [67]: cat_s2  # same rows, but 5 categories
Out[67]:
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (5, object): [a, b, c, d, e]
In [68]: cat_s.value_counts()  # only the four observed categories
Out[68]:
d 2
c 2
b 2
a 2
dtype: int64
In [69]: cat_s2.value_counts()  # the unobserved category 'e' is listed with a count of 0
Out[69]:
d 2
c 2
b 2
a 2
e 0
dtype: int64
# Filtering rows does not shrink the category set; remove_unused_categories does.
In [70]: cat_s3 = cat_s[cat_s.isin(['a', 'b'])]  # boolean mask keeps only rows equal to 'a' or 'b'
In [71]: cat_s3  # 'c' and 'd' are still listed as categories although no rows use them
Out[71]:
0 a
1 b
4 a
5 b
dtype: category
Categories (4, object): [a, b, c, d]
In [72]: cat_s3.cat.remove_unused_categories()  # trims categories to those present; result is displayed, not assigned
Out[72]:
0 a
1 b
4 a
5 b
dtype: category
Categories (2, object): [a, b]
# One-hot encode the categorical Series: one indicator column per category.
# NOTE(review): prompts jump from In [72] to In [74]; the intervening cell is not
# part of this excerpt.
In [74]: pd.get_dummies(cat_s)  # each row has a single 1, in the column of its category
Out[74]:
a b c d
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
4 1 0 0 0
5 0 1 0 0
6 0 0 1 0
7 0 0 0 1
# Group a small key/value frame by `key` and compute the per-group mean.
In [75]: df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,  # rebinds df, replacing the fruit-basket frame built earlier
....: 'value': np.arange(12.)})  # float argument -> float values 0.0 .. 11.0
In [76]: df
Out[76]:
key value
0 a 0.0
1 b 1.0
2 c 2.0
3 a 3.0
4 b 4.0
5 c 5.0
6 a 6.0
7 b 7.0
8 c 8.0
9 a 9.0
10 b 10.0
11 c 11.0
In [77]: g = df.groupby('key').value  # attribute-style selection of the 'value' column from the grouped frame
In [78]: g.mean()  # mean of `value` within each key
Out[78]:
key
a 4.5
b 5.5
c 6.5
Name: value, dtype: float64