import pandas as pd

sample = pd.DataFrame({"event_time": ['2019-10-01 00:04:47 UTC','2019-10-01 00:04:49 UTC',
                                   '2019-10-01 00:04:51 UTC','2019-10-01 00:04:53 UTC',
                                   '2019-10-01 00:04:54 UTC','2019-10-01 00:04:56 UTC',
                                   '2019-10-01 00:04:58 UTC','2019-10-01 00:05:01 UTC',
                                   '2019-10-01 00:05:03 UTC','2019-10-01 00:05:03 UTC',
                                   '2019-10-01 00:05:06 UTC','2019-10-01 00:05:07 UTC',
                                   '2019-10-01 00:05:10 UTC','2019-10-01 00:05:10 UTC',
                                   '2019-10-01 00:05:13 UTC','2019-10-01 00:05:13 UTC',
                                   '2019-10-01 00:05:14 UTC','2019-10-01 00:05:15 UTC',
                                   '2019-10-01 00:05:16 UTC','2019-10-01 00:05:17 UTC'], 
          "event_type": ['view', 'view', 'view', 'view','view','view','view','view',
                         'view','view','view','view','view','view','view','view',
                         'cart','view','view','view'], 
          "product_id": [5100816, 10800048, 15700176,17300555,12703015,2601543, 
                         1004966,1005115,1004284,4300285,26204088,13400134,19000265,
                         1002532,2700609,26400672,5100816,4300262,28101002,26300087],
           "category_id": [2053013553375346967,2053013554994348409,2053013559733912211,
                           2053013553853497655,2053013553559896355,2053013563970159485,
                           2053013555631882655,2053013555631882655,2053013555631882655,
                           2053013552385491165,2053013563693335403,2053013557066334713,
                           2053013557225718275,2053013555631882655,2053013563911439225,
                           2053013563651392361,2053013553375346967,2053013552385491165,
                           2053013564918072245,2053013563584283495],

            "user_id": [550121407, 539194858, 552373753, 520002471, 555448033, 513173999, 
                        523208885, 529755884, 513840435, 544648245, 515547943, 552795065, 
                        525734504, 551377651, 502372361, 555447748, 550121407, 523239174, 
                        537858238, 555448236],
})

                 event_time event_type  product_id          category_id    user_id
0   2019-10-01 00:04:47 UTC       view     5100816  2053013553375346967  550121407
1   2019-10-01 00:04:49 UTC       view    10800048  2053013554994348409  539194858
2   2019-10-01 00:04:51 UTC       view    15700176  2053013559733912211  552373753
3   2019-10-01 00:04:53 UTC       view    17300555  2053013553853497655  520002471
4   2019-10-01 00:04:54 UTC       view    12703015  2053013553559896355  555448033
5   2019-10-01 00:04:56 UTC       view     2601543  2053013563970159485  513173999
6   2019-10-01 00:04:58 UTC       view     1004966  2053013555631882655  523208885
7   2019-10-01 00:05:01 UTC       view     1005115  2053013555631882655  529755884
8   2019-10-01 00:05:03 UTC       view     1004284  2053013555631882655  513840435
9   2019-10-01 00:05:03 UTC       view     4300285  2053013552385491165  544648245
10  2019-10-01 00:05:06 UTC       view    26204088  2053013563693335403  515547943
11  2019-10-01 00:05:07 UTC       view    13400134  2053013557066334713  552795065
12  2019-10-01 00:05:10 UTC       view    19000265  2053013557225718275  525734504
13  2019-10-01 00:05:10 UTC       view     1002532  2053013555631882655  551377651
14  2019-10-01 00:05:13 UTC       view     2700609  2053013563911439225  502372361
15  2019-10-01 00:05:13 UTC       view    26400672  2053013563651392361  555447748
16  2019-10-01 00:05:14 UTC       cart     5100816  2053013553375346967  550121407
17  2019-10-01 00:05:15 UTC       view     4300262  2053013552385491165  523239174
18  2019-10-01 00:05:16 UTC       view    28101002  2053013564918072245  537858238
19  2019-10-01 00:05:17 UTC       view    26300087  2053013563584283495  555448236

I have the DataFrame above, about 14 GB in total. What I need to do is count how many times a user "viewed" a product before adding it to their cart. The problem is that a user will often "view" the product again after adding it to the cart, or even after buying it, so I can't simply count all the views. I tried this code:

# product, user, and row index of every cart event
products = list(data.loc[data["event_type"] == "cart"]["product_id"])
users = list(data.loc[data["event_type"] == "cart"]["user_id"])
idx_carting = list(data.loc[data["event_type"] == "cart"].index)

views_count = []

# for each cart event, slice off everything that happened before it
# and count the rows matching the same user and product
for i in range(len(idx_carting)):
    sub_df = data.iloc[:idx_carting[i]]
    views_count.append(len(sub_df.loc[(sub_df["product_id"] == products[i])
                                      & (sub_df["user_id"] == users[i])]))

But this is far too slow: I left it running for more than two hours and it still hadn't finished. Does anyone know a more efficient way to do this?

Bebba, 2 Nov 2020 at 11:08

1 answer

Best answer

You are better off selecting only the rows of your data that you actually need to look at, grouping them by product and user ID, and then getting the counts. The following counts only the views that occur before the first cart event, so repeated cart additions of the same item are not counted at all.

I added one extra row of data to show that view events after the first cart event are not counted.

import pandas as pd

sample = pd.DataFrame({"event_time": ['2019-10-01 00:04:47 UTC','2019-10-01 00:04:49 UTC',
                                   '2019-10-01 00:04:51 UTC','2019-10-01 00:04:53 UTC',
                                   '2019-10-01 00:04:54 UTC','2019-10-01 00:04:56 UTC',
                                   '2019-10-01 00:04:58 UTC','2019-10-01 00:05:01 UTC',
                                   '2019-10-01 00:05:03 UTC','2019-10-01 00:05:03 UTC',
                                   '2019-10-01 00:05:06 UTC','2019-10-01 00:05:07 UTC',
                                   '2019-10-01 00:05:10 UTC','2019-10-01 00:05:10 UTC',
                                   '2019-10-01 00:05:13 UTC','2019-10-01 00:05:13 UTC',
                                   '2019-10-01 00:05:14 UTC','2019-10-01 00:05:15 UTC',
                                   '2019-10-01 00:05:16 UTC','2019-10-01 00:05:17 UTC',
                                   '2019-10-01 00:05:19 UTC'],
          "event_type": ['view', 'view', 'view', 'view','view','view','view','view',
                         'view','view','view','view','view','view','view','view',
                         'cart','view','view','view', 'view'],
          "product_id": [5100816, 10800048, 15700176,17300555,12703015,2601543,
                         1004966,1005115,1004284,4300285,26204088,13400134,19000265,
                         1002532,2700609,26400672,5100816,4300262,28101002,26300087,
                         5100816],
          "user_id": [550121407, 539194858, 552373753, 520002471, 555448033, 513173999,
                        523208885, 529755884, 513840435, 544648245, 515547943, 552795065,
                        525734504, 551377651, 502372361, 555447748, 550121407, 523239174,
                        537858238, 555448236, 550121407]
})

Below, we use two groupby objects that share the same indexing. The for loop is still slow, but it will be faster than your current approach.

# get the list of product ids that are actually added to a cart
cart_products = sample.query('event_type=="cart"').product_id

# create groupings for each cart event for a user and product
# this is used to get the first cart event
gb_c = sample.query('event_type=="cart"').groupby(['user_id', 'product_id'])

# create a grouping of view events for each user and product where a 
# cart event also occurs in the data
gb_v = (
    sample[sample.product_id.isin(cart_products)]
        .query('event_type=="view"')
        .groupby(['user_id', 'product_id'])
)

# iterate over the groups, pull out the matching group, get the counts
counts = []
indices = []
for ix, g1 in gb_c:
    try:
        g2 = gb_v.get_group(ix)
    except KeyError:
        # handle rare case of a cart event with no view event
        continue
    counts.append(g2.loc[g2.event_time.lt(g1.event_time.iloc[0]), 'event_type'].count())
    indices.append(ix)

# form a new data frame
df = pd.DataFrame(
    {'counts': counts},
    index=pd.MultiIndex.from_tuples(indices, names=['user_id', 'product_id'])
)
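
On the sample above, the pair (user 550121407, product 5100816) gets a count of 1: the view at 00:04:47 is counted, while the extra view at 00:05:19 is not, because it comes after the cart event. For comparison, here is a fully vectorized sketch (my addition, not part of the original answer) that computes the same counts by merging each pair's first cart time onto its view events. One difference: it drops pairs whose views all come after the cart event, where the loop above records a count of 0 for them.

# first cart time for every (user_id, product_id) pair
first_cart = (
    sample.query('event_type == "cart"')
          .groupby(['user_id', 'product_id'])['event_time']
          .min()
          .rename('first_cart_time')
)

# attach that time to each view of a carted product; the string timestamps
# in this fixed format compare correctly in lexicographic order
views = (
    sample.query('event_type == "view"')
          .merge(first_cart, left_on=['user_id', 'product_id'], right_index=True)
)

# keep only views strictly before the first cart event and count per pair
counts = (
    views[views['event_time'] < views['first_cart_time']]
        .groupby(['user_id', 'product_id'])
        .size()
        .rename('counts')
)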

As an example, on a data frame with 50 million rows, this takes less than 2 minutes to finish on my laptop.

import pandas as pd
import numpy as np

s = 50_000_000
df = pd.DataFrame({
    'event_type': np.random.choice(['view', 'cart'], size=s, p=(0.999, 0.001)),
    'product_id': np.random.randint(100000, 200000, size=s),
    'user_id': np.random.randint(100000, 150000, size=s)
    },
    index=pd.date_range('2020-01-01', periods=s, freq='ms', name='event_time')).reset_index()

cart_products = df.query('event_type=="cart"').product_id
gb_c = df.query('event_type=="cart"').groupby(['user_id', 'product_id'])
gb_v = (
    df[df.product_id.isin(cart_products)]
        .query('event_type=="view"')
        .groupby(['user_id', 'product_id'])
)

counts = []
indices = []
for ix, g1 in gb_c:
    try:
        g2 = gb_v.get_group(ix)
    except KeyError:
        continue
    counts.append(g2.loc[g2.event_time.lt(g1.event_time.iloc[0]), 'event_type'].count())
    indices.append(ix)

pd.DataFrame({'counts': counts}, 
    index=pd.MultiIndex.from_tuples(indices, names=['user_id', 'product_id']))
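
One practical note for the full 14 GB dataset, beyond the answer itself: parsing event_time into real datetimes and shrinking the dtypes at load time keeps the comparisons exact and the memory footprint lower. The file name below is hypothetical, and the dtype choices are assumptions based on the sample values.

import pandas as pd

# 'events.csv' is a hypothetical file name; substitute your actual source
data = pd.read_csv(
    'events.csv',
    usecols=['event_time', 'event_type', 'product_id', 'user_id'],
    parse_dates=['event_time'],        # tz-aware datetimes instead of strings
    dtype={
        'event_type': 'category',      # only a handful of distinct values
        'product_id': 'int32',         # assumed to fit in 32 bits
        'user_id': 'int32',            # assumed to fit in 32 bits
    },
)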
James, 2 Nov 2020 at 14:05