RGBD based gaze实验记录

遇到的问题

1.ImportError: cannot import name ‘DtypeArg’ from ‘pandas._typing’

参考:ImportError:无法从'pandas._typing'导入名称'DtypeArg' - 我爱学习网 (5axxw.com)

使用pip list查看已安装的pandas版本。

将pandas卸载后重装即可。

2. File “/home/workspace/feifeizhang/anconda/lib/python3.7/site-packages/pandas/core/indexes/base.py”, line 3363, in get_loc raise KeyError(key) from err KeyError: ‘left_eye_coord’

1
2
self.le_coord_list = (root_dir + "/" + self.anno["left_eye_coord"]).tolist()
self.re_coord_list = (root_dir + "/" + self.anno["right_eye_coord"]).tolist()

源代码中这两行原本是被注释掉的(取消注释就会出现上面的KeyError);但是如果保持注释,运行到下面这一行时又会出现

le_coor = np.load(self.le_coord_list[idx])

AttributeError: ‘GazePointAllDataset’ object has no attribute ‘le_coord_list’

解决办法

已经发现了数据组织结构的问题。看代码gaze_dataset.py发现数据组织格式有问题,运行GAZE/RGBD-Gaze-master/code/data/data_check.py时报错:

Traceback (most recent call last):
File “/home/workspace/feifeizhang/GAZE/RGBD-Gaze-master/code/data/data_check.py”, line 1, in
from data.gaze_dataset import GazePointAllDataset
ModuleNotFoundError: No module named ‘data’

看一下coord的.npy文件,把内容读出来看看。或者发邮件询问吧。

3.RuntimeError: expand(torch.cuda.FloatTensor{[2, 1]}, size=[]): the number of sizes provided (0) must be greater or equal to the number of dimensions in the tensor (2)

Traceback (most recent call last):
File “/home/workspace/feifeizhang/RGBDgaze/RGBD-Gaze-master/code/trainer_aaai.py”, line 666, in
trainer.train_base(epochs= total_epochs, lr=learning_rate,use_refined_depth=True)
File “/home/workspace/feifeizhang/RGBDgaze/RGBD-Gaze-master/code/trainer_aaai.py”, line 118, in train_base
self._train_base_epoch()
File “/home/workspace/feifeizhang/RGBDgaze/RGBD-Gaze-master/code/trainer_aaai.py”, line 411, in _train_base_epoch
left_eye_info[j, 2] = th.median(cur_depth).item() * face_factor
RuntimeError: expand(torch.cuda.FloatTensor{[2, 1]}, size=[]): the number of sizes provided (0) must be greater or equal to the number of dimensions in the tensor (2)

非常奇怪的是,当我把batchsize设置为1时,此问题消失了orz

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
if self.temps.use_refined_depth:
with th.no_grad():
left_eye_bbox[:, :2] -= face_bbox[:, :2]
left_eye_bbox[:, 2:] -= face_bbox[:, :2]
right_eye_bbox[:, :2] -= face_bbox[:, :2]
right_eye_bbox[:, 2:] -= face_bbox[:, :2]#减去face框的左上角坐标,相当于得到眼睛在face中的位置
left_eye_bbox = th.clamp(face_factor * left_eye_bbox, min=0, max=223)#用斜坡函数
right_eye_bbox = th.clamp(face_factor * right_eye_bbox, min=0, max=223)

for j, lb in enumerate(left_eye_bbox):
#left_eye_bbox torchsize[2,4]
#refined_depth torchsize[2,1,224,224]
cur_depth = refined_depth[j, :, int(lb[1]):int(lb[3]), int(lb[0]):int(lb[2])]
#cur_depth[1,86,86]
left_eye_info[j, 2] = th.median(cur_depth).item() * face_factor
#left_eye_info torchsize[2,3]
for j, rb in enumerate(right_eye_bbox):
cur_depth = refined_depth[j, :, int(rb[1]):int(rb[3]), int(rb[0]):int(rb[2])]
right_eye_info[j, 2] = th.median(cur_depth).item() * face_factor
1
2
3
4
5
6
7
8
9
10
11
12
13
        #打印出维度,查看结果
print("the shape of depth:",cur_depth.shape)
print("the shape of left_eye_info :",left_eye_info.shape)
print("left_eye_info[j, 2]",left_eye_info[j, 2])
a= th.median(cur_depth).item() * face_factor
print("median depth:",a)
print("median shape",a.shape)
>>the shape of depth: torch.Size([1, 89, 90])
>>the shape of left_eye_info : torch.Size([2, 3])
>>left_eye_info[j, 2] tensor(0.4084, device='cuda:0')
>>median depth: tensor([[0.2566],
[0.3694]], device='cuda:0')
>>median shape torch.Size([2, 1])

the shape of depth: torch.Size([1, 88, 88])
the shape of left_eye_info : torch.Size([16, 3])
left_eye_info[j, 2] tensor(0.9088, device='cuda:0')
median depth: tensor([[0.6016],
[0.6016],
[0.5006],
[0.5006],
[0.6016],
[0.6016],
[0.5006],
[0.5025],
[0.5006],
[0.4179],
[0.4179],
[0.5025],
[0.3485],
[0.5006],
[0.4179],
[0.5025]], device='cuda:0')
median shape torch.Size([16, 1])
Traceback (most recent call last):
File “/home/workspace/feifeizhang/RGBDgaze/RGBD-Gaze-master/code/trainer_aaai.py”, line 676, in
trainer.train_base(epochs= total_epochs, lr=learning_rate,use_refined_depth=True)
File “/home/workspace/feifeizhang/RGBDgaze/RGBD-Gaze-master/code/trainer_aaai.py”, line 119, in train_base
self._train_base_epoch()
File “/home/workspace/feifeizhang/RGBDgaze/RGBD-Gaze-master/code/trainer_aaai.py”, line 419, in _train_base_epoch
left_eye_info[j, 2] = th.mean(a).item() * face_factor
RuntimeError: expand(torch.cuda.FloatTensor{[16, 1]}, size=[]): the number of sizes provided (0) must be greater or equal to the number of dimensions in the tensor (2)

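找到原因啦!其实face_factor是整个batch的face_factor(形状为[B, 1]的张量),所以 th.median(cur_depth).item() * face_factor 的结果形状是[B, 1],无法赋值给标量位置 left_eye_info[j, 2]。应该只乘以第j个样本对应的face_factor(例如face_factor[j])。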

程序结构

目录

image-20211019145428957

image-20211019154913500

image-20211025165956657

gen landmark.py

trainer_aaai.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class GazeTrainer(Trainer):
#继承Trainer类

self.weights_init(self.models.decoder)
def train_base(self, epochs, lr=1e-4, use_refined_depth=False, fine_tune_headpose=True):
def train_headpose(self, epochs, lr=2e-4, lambda_loss_mse=1):
self.temps.headpose_logger = self.logger.getChild('train_headpose')
self.temps.headpose_logger.info('preparing for headpose training loop.')
self.temps.headpose_logger = self.logger.getChild('train_headpose')
self.temps.headpose_logger.info('preparing for headpose training loop.')
#没看明白此处log来自于哪里?
# prepare logger
# prepare dataloader 调用def _get_trainloader(self):
# start training loop

def resume(self, filename):
def _prepare_model(self, model, train=True):
def _get_trainloader(self):
#data_transforms,
#调用 transformed_train_dataset = GazePointAllDataset(root_dir=self.data_root,transform=data_transforms['train'],phase='train',face_image=True, face_depth=True, eye_image=True,eye_depth=True,info=True, eye_bbox=True, face_bbox=True, eye_coord=True)
#参数有faceimage、depth、bbox,eye_image、eye_depth、eye_bbox、eye_coord)
#调用 gaze_dataset.py
def _get_valloader(self):

def _init_base_meters(self):
def _init_headpose_meters(self):

def _plot_base(self):
def _plot_headpose(self):

def _log_base(self):
def _log_headpose(self):

def _train_base_epoch(self):
def _train_headpose_epoch(selfs):

def _test_base(self):
def _test_headpose(self):

trainer = GazeTrainer(
exp_name="gaze_aaai_refine_headpose"
)

GazeTrainer类继承自父类Trainer,父类定义在文件GAZE/RGBD-Gaze-master/code/utils/trainer.py中

1
2
def __init__(self, checkpoint_dir='./', is_cuda=True):#初始化

GAZE/RGBD-Gaze-master/code/utils/edict.py

1
#对于字典的编辑

gaze_aaai.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def resnet34(pretrained=False, **kwargs):
class Decoder(nn.Module):
#左右眼decoder
class DepthBCE(nn.Module):
class RefineDepth(nn.Module):
self.depth_block4
#faceblock1、2、3、4,depthblock1、2、3、4
#每一层输出的向量维度一样,卷积,bn,relu 。in3,每一个block out 64、128、256、512
self.down4
#4个down层用于降低通道维度,in1024,out512,256,128,64,卷积、bn、relu,第一个down在relu之后加了resnet.
self.head_pose
#in512,out1024,128
self.gen_block1



down层输入为1024,感觉是两个512的feature级联起来的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def forward(self, face, depth):
#face.shape BX3X224X224 depth.shape BX1X64X64
face_f1 = self.face_block1(face)
#face_f1.shape BX64X224X224
face_f2 = self.face_block2(face_f1)
#face_f2.shape BX128X112X112
face_f3 = self.face_block3(face_f2)
#face_f3.shape BX256X56X56
face_f4 = self.face_block4(face_f3)
#face_f4.shape BX512X28X28
depth_f1 = self.depth_block1(depth)
depth_f2 = self.depth_block2(depth_f1)
depth_f3 = self.depth_block3(depth_f2)
depth_f4 = self.depth_block4(depth_f3)

#不同尺度的特征级联
mixed_f4 = self.down1(th.cat([face_f4, depth_f4], dim=1))
#mixed_f4.shape BX512X28X28
mixed_f3 = self.down2(th.cat([face_f3, depth_f3], dim=1))
#mixed_f3.shape BX256X56X56
mixed_f2 = self.down3(th.cat([face_f2, depth_f2], dim=1))
#mixed_f2.shape BX128X112X112
mixed_f1 = self.down4(th.cat([face_f1, depth_f1], dim=1))
#mixed_f1.shape BX64X224X224

#对于小尺度的特征上采样,做特征金字塔
gen_f3 = self.gen_block1(mixed_f4) + mixed_f3#zff: fpn
#gen_f3.shape BX256X56X56
gen_f2 = self.gen_block2(gen_f3) + mixed_f2
#gen_f2.shape BX128X112X112
gen_f1 = self.gen_block3(gen_f2) + mixed_f1
#gen_f1.shape BX64X224X224
gen_depth = self.gen_block4(gen_f1)
#gen_depth.shape BX1X224X224
head_pose = self.head_pose(mixed_f4)
#head_pose.shape BX128X1x1
return head_pose.view(head_pose.size(0), -1), gen_depth
#head_pose.shape(BX128)

GAZE/RGBD-Gaze-master/code/data/gaze_dataset.py

27英寸,长60厘米,宽34厘米(精确值:59.77厘米,33.62厘米)

在trainer_aaai.py调用时候

1
2
transformed_train_dataset = GazePointAllDataset(root_dir=self.data_root,transform=data_transforms['train'],phase='train',face_image=True, face_depth=True, eye_image=True,eye_depth=True,info=True, eye_bbox=True, face_bbox=True, eye_coord=True)
#参数有face_image、depth、bbox,eye_image、eye_depth、eye_bbox、face_bbox、info、eye_coord)
1
2
class GazePointAllDataset(data.Dataset):
def __init__(self, root_dir, w_screen=59.77, h_screen=33.62, transform=None, phase="train", **kwargs):

程序错误:数据集train_csv没有left_eye_coord、right_eye_coord这两列,感觉是这个类调用有问题,或者是类的构造有问题,导致加载数据出错。

数据集结构

数据集由165231个RGB/depth图像对组成。使用159个参与者对应的图像(119,318个RGB/depth图像对)作为训练数据,使用其余59个参与者对应的数据(45,913个RGB/depth图像对)作为测试数据。

image-20211025183820769

train_meta.csv包含了119,318个RGB/depth图像对,表头有12项,分别为

A.图片index。

B.face_images图片存的位置,位于color文件夹下。

C.face_depth存的位置,位于projected_depth_calibration。

D.face_bbox,存于txt文件中,为4个坐标值,位于color文件夹下。

E.left_eye_image,位于color文件夹下。

F.right_eye_image,位于color文件夹下。

G.left_eye_depth,位于projected_depth_calibration下。

H.right_eye_depth,位于projected_depth_calibration下。

I.left_eye_bbox,位于color文件夹下。

J.right_eye_bbox,位于color文件夹下。

K.gaze_point,位于coordinate文件夹下,文件名以.npy结尾,文件中储存的是坐标。

L.has_landmark,值为TRUE或者FALSE,大部分值都为TRUE。

color、projected_depth_calibration、coordinate文件夹均有219个文件夹对应219个志愿者,每个志愿者的文件夹下有多组实验的数据。

color文件夹下存放了219个志愿者的rgb相关信息,一组实验包含7个信息,全脸、左右眼的图片以及bbox,还有人脸的68点landmark。

image-20211025195154291

projected_depth_calibration文件夹下存放了219个志愿者的depth相关信息,一组实验包含3个信息,分别为左右眼和全脸depth。

coordinate文件夹下存放了219个志愿者眼睛坐标系。

数据集中少了左眼坐标(left_eye_coord)和右眼坐标(right_eye_coord)。不知道这两项在网络中是怎么使用的?如果拿掉会怎么样?.mat格式的文件里边是不是保存有?用python把.mat文件转换成excel,看一下格式吧。

读取数据集遇到的问题

企图通过各个特征的维度来弄清 le coord 和 re coord 到底是什么。/(ㄒoㄒ)/~~

image-20211118165313972

resnet 34返回特征 512维度。(经过pooling,每个维度只有一个点)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def forward(self, face, depth):
face_f1 = self.face_block1(face)#3->64卷积、bn、relu
face_f2 = self.face_block2(face_f1)#64->128
face_f3 = self.face_block3(face_f2)#128->256
face_f4 = self.face_block4(face_f3)#256->512 28X28
depth_f1 = self.depth_block1(depth)#1->64
depth_f2 = self.depth_block2(depth_f1)#64->128
depth_f3 = self.depth_block3(depth_f2)#128->256
depth_f4 = self.depth_block4(depth_f3)#256->512 28X28
mixed_f4 = self.down1(th.cat([face_f4, depth_f4], dim=1))#1024->512
mixed_f3 = self.down2(th.cat([face_f3, depth_f3], dim=1))#512->256
mixed_f2 = self.down3(th.cat([face_f2, depth_f2], dim=1))#256->128
mixed_f1 = self.down4(th.cat([face_f1, depth_f1], dim=1))#128->64 224X224

gen_f3 = self.gen_block1(mixed_f4) + mixed_f3#512->256反卷积 56X56
gen_f2 = self.gen_block2(gen_f3) + mixed_f2#256->128 反卷积112X112
gen_f1 = self.gen_block3(gen_f2) + mixed_f1#128->64 反卷积224X224
gen_depth = self.gen_block4(gen_f1)#64->1
#和论文中的框图不一样,不过维度和 synthesize depth 对上了

head_pose = self.head_pose(mixed_f4)#512,28X28->512,14X14->1024,7X7->1024,1X1->128,1X1
return head_pose.view(head_pose.size(0), -1), gen_depth#6276. 1,224X224

似乎找到问题的答案啦!

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#trainer_aaai.py
for j, lb in enumerate(left_eye_bbox):
cur_depth = refined_depth[j, :, int(lb[1]):int(lb[3]), int(lb[0]):int(lb[2])]
left_eye_info[j, 2] = th.median(cur_depth).item() * face_factor
for j, rb in enumerate(right_eye_bbox):
cur_depth = refined_depth[j, :, int(rb[1]):int(rb[3]), int(rb[0]):int(rb[2])]
right_eye_info[j, 2] = th.median(cur_depth).item() * face_factor

#gaze_aaai.py
l_coord = self.lcoord(th.cat([l_coord_feat, head_pose, linfo], 1))
r_coord = self.rcoord(th.cat([r_coord_feat, head_pose, rinfo], 1))

rinfo最终concatenate的有三个信息:眼睛区域depth的中值(th.median,乘以face_factor),以及眼睛中心点的坐标xe、ye。

使用眼边缘六个点做平均获得眼睛中心位置。

image-20211122151234305

image-20211122151515079

屏幕分辨率1080(540),1960(980)。对于一个图片矩阵,(0,0)在左上,所以这里显示的landmark图片是upside down的。

处理

读出保存数据的csv,计算landmark眼周六点的平均值,获得双眼中心,保存至csv。对于landmark不存在的数据,从保存数据的csv中删除。

测试集没有删除数据,训练集数据从119317条删减至98629条。

程序中对于gt处理,为什么有一个screen/2平移

w_screen=59.77, h_screen=33.62

​ gt[0] -= self.w_screen / 2

​ gt[1] -= self.h_screen / 2

为什么程序有一个减法?相当于平移。

image-20211122145542394

color00008_face

color00009_face

上海科大的深度图的动态范围是啥?太黑了。

用了numpy和opencv读全都是黑乎乎的。

/home/workspace/feifeizhang/GAZE/RGBD-Gaze-master/SHtechSave/results/gaze_aaai_refine_headpose/val/depth

image-20220410220254398ep00iter0000_rf.png

看一下深度图是怎么保存的吧,确认一下它的动态范围。