Related links:
- TensorFlow 官网
- Real-time Human Pose Estimation in the Browser with TensorFlow.js
- Github-posenet
- Demo
- Move Mirror: An AI Experiment with Pose Estimation in the Browser using TensorFlow.js
Loading the pre-trained PoseNet model
- MobileNet: smaller, faster, less accurate;
- ResNet: larger, slower, more accurate;
- For a detailed description of the parameters, see the Github-posenet repo and the Demo linked above;
To host the model files locally, you can proceed as follows:
- Open the PoseNet Demo page;
- Adjust the parameters until you get the result you want;
- In the browser's developer tools, find the corresponding network resources and save them locally;
- Put the json and bin files in the same place (the same local folder);
- For example, with ResNet50, outputStride: 32, quantBytes: 2, the files are model-stride32.json and group1-shard1of23.bin through group1-shard23of23.bin;
- Different parameters produce different json and bin files, so set the parameters you want before downloading;
posenet.load({
  // Alternative: the ResNet50 configuration downloaded above.
  // architecture: 'ResNet50',
  // inputResolution: 250,
  // outputStride: 32,
  // quantBytes: 2,
  // modelUrl: 'model/resnet50/model-stride32.json'
  architecture: 'MobileNetV1',
  inputResolution: 500,
  outputStride: 16,
  multiplier: 1,
  quantBytes: 4,
  // modelUrl points at the locally saved json file; the bin shards must sit next to it.
  modelUrl: 'model/mobilenetv1/model-stride16.json'
}).then(net => {
  // Estimate a single pose from the given image element.
  let pose = net.estimateSinglePose(image, { flipHorizontal: false });
  return pose;
}).then(pose => {
  // Use the detected pose (keypoints, score) here.
});
Pose matching
See the Move Mirror article linked at the top; it explains the main matching techniques quite clearly. A brief summary:
- The images to be matched need to be cropped and scaled first. The reason is that even if two people are in essentially the same pose, if they appear at different positions in the image (for example, one near the left edge and one near the right edge), the coordinates in their keypoints will still differ a lot. The recommended approach is to crop the person out using minX, maxX, minY, maxY of the boundingBox computed by posenet, draw the crop into the center of a fixed-size image, and then recompute the keypoint coordinates relative to that new image (the cropping snippet below does this);
- Apply L2 normalization. One thing I have not figured out here: when doing the Cosine Similarity matching, the similarity function already includes an l2norm step, so does the L2 normalization mentioned in the article simply refer to that l2norm inside similarity, or is an extra step required? (A sketch of an explicit normalization step follows right after this list;)
- Cosine-distance matching: both the captured pose and the reference pose have to be converted into length-34 arrays of coordinate pairs (see the cosine-distance snippet below);
- Weighted matching: both the captured pose and the reference pose have to be converted into length-52 arrays, where indices 0-33 are the coordinate pairs, 34-50 are the confidence scores, and 51 is the sum of the confidences (see the last snippet below);
- For the matching result, the smaller the value, the closer the two poses are;
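Regarding the L2-normalization question above: cosine similarity is scale-invariant, so an explicit normalization step should not change the result if the similarity helper already divides by the vector norms. A minimal sketch of doing it explicitly anyway, applied to the length-34 coordinate vectors built in the snippets below (l2Normalize is a hypothetical helper, not part of posenet or the similarity library):

// Hypothetical helper: scale a pose vector (e.g. the length-34 coordinate array)
// to unit L2 norm before comparing it with another pose.
function l2Normalize(vector) {
  const norm = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0));
  return norm > 0 ? vector.map(v => v / norm) : vector.slice();
}

// Usage sketch: normalize both vectors before computing the similarity.
// let cosineSimilarity = similarity(l2Normalize(curData), l2Normalize(matData));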
// Crop the person out of the source image using the bounding box of the detected
// keypoints, draw the crop centered into a fixed-size (480x480) image, and recompute
// the keypoint coordinates relative to the new image.
// `ctx` is assumed to be the 2D context of a 480x480 canvas, `image` the source image.
const boundingBox = posenet.getBoundingBox(pose.keypoints);
const bx = boundingBox.minX;
const by = boundingBox.minY;
const bw = boundingBox.maxX - boundingBox.minX;
const bh = boundingBox.maxY - boundingBox.minY;
const w = 480, h = 480;
let nx, ny, nw, nh, s;
// Scale so the larger side of the crop fits the target size while keeping the aspect ratio.
if (bw / bh > w / h) {
  s = w / bw;
} else {
  s = h / bh;
}
nw = bw * s;
nh = bh * s;
nx = (w - nw) * 0.5;
ny = (h - nh) * 0.5;
ctx.rect(0, 0, w, h);
ctx.fillStyle = 'grey';
ctx.fill();
ctx.drawImage(image, bx, by, bw, bh, nx, ny, nw, nh);
// Map every keypoint into the coordinate system of the new image.
let newKeypoints = [];
for (let i = 0; i < pose.keypoints.length; i++) {
  newKeypoints[i] = {
    score: pose.keypoints[i].score,
    part: pose.keypoints[i].part,
    position: {
      x: nx + (pose.keypoints[i].position.x - bx) * s,
      y: ny + (pose.keypoints[i].position.y - by) * s
    }
  };
}
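For reference, the conversion used in the Move Mirror article (and in the next snippet) turns the cosine similarity of the two length-34 coordinate vectors into a distance:

D(F_{xy}, G_{xy}) = \sqrt{2\,\bigl(1 - \mathrm{cosineSimilarity}(F_{xy}, G_{xy})\bigr)}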
// Cosine-distance matching: flatten both poses into length-34 vectors
// (x, y of each of the 17 keypoints), compute the cosine similarity,
// and convert it into a distance (smaller = more similar).
// `matKeypoints` is the reference pose, `similarity` the cosine-similarity helper.
let curData = [], matData = [];
let idx = 0, ord = 0;
for (let j = 0; j < 34; j++) {
  curData[j] = (ord > 0) ? newKeypoints[idx].position.y : newKeypoints[idx].position.x;
  matData[j] = (ord > 0) ? matKeypoints[idx].position.y : matKeypoints[idx].position.x;
  ord++;
  if (ord > 1) {
    ord = 0;
    idx++;
  }
}
let cosineSimilarity = similarity(curData, matData);
let distance = 2 * (1 - cosineSimilarity);
console.log(Math.sqrt(distance));
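The weighted-matching distance, which the next snippet implements, weights each coordinate difference by the confidence of the corresponding keypoint in the captured pose and divides by the confidence sum:

D(F, G) = \frac{1}{\sum_{k=1}^{17} F_{c_k}} \sum_{j=1}^{34} F_{c_{\lceil j/2 \rceil}} \,\bigl| F_{xy_j} - G_{xy_j} \bigr|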
// Weighted matching: build length-52 vectors (34 coordinates, 17 confidence scores,
// 1 confidence sum) for both poses, then compute the weighted distance.
// `formatKeypoints` and the stored reference pose `MATCH_KEYPOINTS` are assumed
// to be defined elsewhere.
let curKeypoints = [], matKeypoints = [];
let curData = [], matData = [];
let curSum = 0, matSum = 0;
let idx = 0, ord = 0;
curKeypoints = formatKeypoints(newKeypoints);
matKeypoints = formatKeypoints(MATCH_KEYPOINTS);
for (let j = 0; j < 52; j++) {
  if (j < 34) {
    // Indices 0-33: x/y coordinates of the 17 keypoints.
    curData[j] = ord > 0 ? curKeypoints[idx].position.y : curKeypoints[idx].position.x;
    matData[j] = ord > 0 ? matKeypoints[idx].position.y : matKeypoints[idx].position.x;
    ord++;
    if (ord > 1) {
      ord = 0;
      idx++;
    }
  } else if (j >= 34 && j < 51) {
    // Indices 34-50: confidence score of each keypoint.
    if (j === 34) {
      idx = 0;
    }
    curData[j] = curKeypoints[idx].score;
    matData[j] = matKeypoints[idx].score;
    curSum += curKeypoints[idx].score;
    matSum += matKeypoints[idx].score;
    idx++;
  } else {
    // Index 51: sum of the confidence scores.
    curData[j] = curSum;
    matData[j] = matSum;
  }
}
// Weighted distance: each coordinate difference is weighted by the confidence
// of the corresponding keypoint in the captured pose.
let vector1PoseXY = curData.slice(0, 34);
let vector1Confidences = curData.slice(34, 51);
let vector1ConfidenceSum = curData.slice(51, 52);
let vector2PoseXY = matData.slice(0, 34);
let summation1 = 1 / vector1ConfidenceSum[0];
let summation2 = 0;
for (let k = 0; k < vector1PoseXY.length; k++) {
  let tempConf = Math.floor(k / 2);
  let tempSum = vector1Confidences[tempConf] * Math.abs(vector1PoseXY[k] - vector2PoseXY[k]);
  summation2 = summation2 + tempSum;
}
console.log(summation1 * summation2);
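As noted above, the smaller the resulting value, the closer the match. A minimal sketch of picking the best match among several stored reference poses, assuming a hypothetical weightedDistance(a, b) wrapper around the computation above and a hypothetical referencePoses array of { name, keypoints } entries (neither exists in the code above):

// Hypothetical: find the stored pose closest to the captured one.
let best = null;
for (const ref of referencePoses) {
  const d = weightedDistance(newKeypoints, ref.keypoints);
  if (best === null || d < best.distance) {
    best = { name: ref.name, distance: d };
  }
}
console.log(best);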