文章目录
材料准备
一、统计广告TopN
1.1 统计每一个省份点击TOP3的广告ID
1.2 统计每一个省份每一个小时的TOP3广告ID
二、基站停留时间TopN
三、ip地址统计
材料准备
新建一个object文件存放所有案例文件的路径对象
/**
 * Shared registry of the input locations read by the example jobs.
 *
 * The leading "..." in every value is kept exactly as written; it appears to be a
 * placeholder for the local data directory and must be replaced before running.
 */
object ExampleConstants {

  /** Ad click log (whitespace-separated: timestamp province city userid adid). */
  val PATH_ADS: String = "...\\案例数据\\统计广告ID\\ad.log"

  /** Directory that holds the base-station example data. */
  val PATH_LAC: String = "...\\案例数据\\基站停留时间TopN"

  /** Base-station info input (comma-separated; station id, longitude, latitude come first). */
  val PATH_LAC_INFO: String = "...\\案例数据\\基站停留时间TopN\\lac_info"

  /** Glob matching every user connection log of the base-station example. */
  val PATH_LAC_LOG: String = "...\\案例数据\\基站停留时间TopN\\*.log"

  /** HTTP access log of the IP statistics example. */
  val PATH_IP_LOG: String = "...\\案例数据\\IP地址统计\\http.log"

  /** IP rule file of the IP statistics example. */
  val PATH_IP_IP: String = "...\\案例数据\\IP地址统计\\ip.txt"
}
一、统计广告TopN
数据格式:
timestamp province city userid adid
时间点 省份 城市 用户 广告
1601289641925 6 5 74 4
1601289650719 5 5 90 4
1601289667128 4 1 19 10
1601289655299 1 6 74 2
1601289637363 7 8 59 9
1601289656550 3 1 74 1
1601289657010 4 7 43 1
1601289664045 5 8 53 5
用户id范围 0-99
省份,城市,id范围:0-9
adid范围:0-19
1.1 统计每一个省份点击TOP3的广告ID
import org
.apache
.spark
.rdd
.RDD
import org
.apache
.spark
.{SparkConf
, SparkContext
}
/**
 * Prints, for every province, the three ad IDs with the most clicks.
 *
 * Input lines are split on runs of spaces/tabs and read as
 * `timestamp province city userid adid`; field 1 is the province, field 4 the ad ID.
 */
object _01_Ads {

  /** Local-mode Spark context shared by this example; stopped when `main` finishes. */
  val sc: SparkContext = new SparkContext(new SparkConf().setMaster("local").setAppName("Ads"))

  /**
   * Runs the job and prints both the full per-province grouping and the top-3 result.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    try {
      val lines: RDD[String] = sc.textFile(ExampleConstants.PATH_ADS)

      // One ((province, adId), 1) pair per click. Lines with fewer than five
      // fields (e.g. a trailing blank line) are skipped instead of failing the
      // whole job with an ArrayIndexOutOfBoundsException.
      val clicks: RDD[((String, String), Int)] = lines
        .map(_.split("[ \t]+"))
        .filter(_.length >= 5)
        .map(parts => ((parts(1), parts(4)), 1))

      // Total clicks per (province, adId).
      val clickCounts: RDD[((String, String), Int)] = clicks.reduceByKey(_ + _)

      // Re-key by province so all ads of one province can be grouped together.
      val byProvince: RDD[(String, (String, Int))] = clickCounts.map {
        case ((province, adId), count) => (province, (adId, count))
      }

      // A single output partition keeps the printed groups together.
      val grouped: RDD[(String, Iterable[(String, Int)])] = byProvince.groupByKey(1)
      grouped.foreach(println)

      // Highest click count first, keep the first three entries.
      val resRDD: RDD[(String, List[(String, Int)])] =
        grouped.mapValues(ads => ads.toList.sortBy(ad => -ad._2).take(3))
      resRDD.foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
1.2统计每一个省份每一个小时的TOP3广告ID
import java
.util
.{Calendar
, Date
}
import org
.apache
.spark
.rdd
.RDD
import org
.apache
.spark
.{SparkConf
, SparkContext
}
/**
 * Prints, for every (province, hour) pair, the three ad IDs with the most clicks.
 *
 * Input lines are split on runs of spaces/tabs and read as
 * `timestamp province city userid adid`, where the timestamp is epoch milliseconds.
 */
object _02_Ads {
  // Needed by timeTransform; imported locally so the object compiles on its own.
  import java.text.SimpleDateFormat

  /**
   * Runs the job and prints one line per (province, hour) with its top-3 ads.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = new SparkContext(new SparkConf().setMaster("local").setAppName("Ads"))
    try {
      val lines: RDD[String] = sc.textFile(ExampleConstants.PATH_ADS)

      // One ((province, hour, adId), 1) pair per click. Lines with fewer than
      // five fields (e.g. a trailing blank line) are skipped instead of failing
      // the job with an ArrayIndexOutOfBoundsException.
      val clicks: RDD[((String, String, String), Int)] = lines
        .map(_.split("[ \t]+"))
        .filter(_.length >= 5)
        .map(parts => ((parts(1), getHour(parts(0)), parts(4)), 1))

      // Total clicks per (province, hour, adId).
      val clickCounts: RDD[((String, String, String), Int)] = clicks.reduceByKey(_ + _)

      // Re-key by (province, hour) and collect every (adId, count) of that slot.
      val grouped: RDD[((String, String), Iterable[(String, Int)])] = clickCounts
        .map { case ((province, hour, adId), count) => ((province, hour), (adId, count)) }
        .groupByKey()

      // Highest click count first, keep the first three entries.
      val res: RDD[((String, String), List[(String, Int)])] =
        grouped.mapValues(ads => ads.toList.sortBy(ad => -ad._2).take(3))
      res.foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

  /**
   * Truncates an epoch-millisecond timestamp to its hour, in the JVM default time zone.
   *
   * The minute is deliberately not part of the result: the value is used as a
   * grouping key, and including it would group clicks per minute rather than per hour.
   *
   * @param timestamp epoch milliseconds as a decimal string
   * @return the hour bucket formatted as `yyyy/MM/dd/HH`
   * @throws NumberFormatException if `timestamp` is not a valid long
   */
  def getHour(timestamp: String): String = {
    val calendar: Calendar = Calendar.getInstance()
    calendar.setTimeInMillis(timestamp.toLong)
    val year = calendar.get(Calendar.YEAR)
    val month = calendar.get(Calendar.MONTH) + 1 // Calendar.MONTH is zero-based
    val day = calendar.get(Calendar.DAY_OF_MONTH)
    val hour = calendar.get(Calendar.HOUR_OF_DAY)
    f"$year/$month%02d/$day%02d/$hour%02d"
  }

  /**
   * Formats an epoch-millisecond timestamp as `yyyy-MM-dd HH` in the JVM default time zone.
   *
   * @param time epoch milliseconds as a decimal string
   * @return the formatted date and hour
   * @throws NumberFormatException if `time` is not a valid long
   */
  def timeTransform(time: String): String = {
    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH")
    sdf.format(new Date(time.toLong))
  }
}
二、基站停留时间TopN
根据用户产生日志的信息(*.log),统计在哪个基站停留时间最长
文件组成: 手机号,时间戳,基站ID,连接状态(1连接,0断开)
lac_info.txt文件中存储基站信息
文件组成:基站ID,经度,纬度
思路:
1.获取用户产生的日志信息并切分
2.用户在基站停留的总时长
3.获取基站的基础信息
4.把经纬度的信息join到用户数据中
5.求出用户在某些基站停留的时间top2
a.log
18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1
18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0
18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0
b.log
18611132889,20160327075000,9F36407EAD0629FC166F14DDE7970F68,1
18688888888,20160327075100,9F36407EAD0629FC166F14DDE7970F68,1
18611132889,20160327081000,9F36407EAD0629FC166F14DDE7970F68,0
18688888888,20160327081300,9F36407EAD0629FC166F14DDE7970F68,0
18688888888,20160327175000,9F36407EAD0629FC166F14DDE7970F68,1
18611132889,20160327182000,9F36407EAD0629FC166F14DDE7970F68,1
18688888888,20160327220000,9F36407EAD0629FC166F14DDE7970F68,0
18611132889,20160327230000,9F36407EAD0629FC166F14DDE7970F68,0
c.log
18611132889,20160327081100,CC0710CC94ECC657A8561DE549D940E0,1
18688888888,20160327081200,CC0710CC94ECC657A8561DE549D940E0,1
18688888888,20160327081900,CC0710CC94ECC657A8561DE549D940E0,0
18611132889,20160327082000,CC0710CC94ECC657A8561DE549D940E0,0
18688888888,20160327171000,CC0710CC94ECC657A8561DE549D940E0,1
18688888888,20160327171600,CC0710CC94ECC657A8561DE549D940E0,0
18611132889,20160327180500,CC0710CC94ECC657A8561DE549D940E0,1
18611132889,20160327181500,CC0710CC94ECC657A8561DE549D940E0,0
loc.info
9F36407EAD0629FC166F14DDE7970F68,116.304864,40.050645,6
CC0710CC94ECC657A8561DE549D940E0,116.303955,40.041935,6
16030401EAFB68F1E3CDF819735E1C66,116.296302,40.032296,6
import org
.apache
.spark
.rdd
.RDD
import org
.apache
.spark
.{SparkConf
, SparkContext
}
/**
 * Prints, for every phone number, the two base stations where it stayed longest,
 * together with each station's coordinates.
 *
 * Log lines are comma-separated: `phone,timestamp,stationId,event`, where the
 * timestamp is formatted `yyyyMMddHHmmss` and event "1" marks a connect (any other
 * value is treated as a disconnect). Station info lines are comma-separated with
 * station id, longitude and latitude as the first three fields.
 */
object _03_Lac {
  import java.time.{LocalDateTime, ZoneOffset}
  import java.time.format.DateTimeFormatter

  /**
   * Runs the job and prints `(phone, List((stationId, longitude, latitude)))`.
   *
   * @param args optional; when `args(0)` is a non-empty path the result is also
   *             saved there as text. Without it nothing is written.
   */
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = new SparkContext(new SparkConf().setMaster("local").setAppName("lac"))
    try {
      val logLines: RDD[String] = sc.textFile(ExampleConstants.PATH_LAC_LOG)

      // ((phone, stationId), signedSeconds): connects contribute -t and
      // disconnects +t, so summing a matched pair yields the stay in seconds.
      // The timestamp is parsed as a date-time first: subtracting the raw
      // yyyyMMddHHmmss digits is not elapsed time (07:50:00 -> 08:10:00 would
      // give 6000 instead of 1200 seconds) and can distort the ranking.
      val signedSeconds: RDD[((String, String), Long)] = logLines.mapPartitions { lines =>
        // Built once per partition, on the side that does the parsing.
        val stampFormat = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")
        lines.map { line =>
          val parts: Array[String] = line.split(",")
          val phone: String = parts(0)
          val lacId: String = parts(2)
          // Only differences matter, so the wall-clock value is read as UTC.
          val seconds: Long =
            LocalDateTime.parse(parts(1), stampFormat).toEpochSecond(ZoneOffset.UTC)
          val signed: Long = if (parts(3) == "1") -seconds else seconds
          ((phone, lacId), signed)
        }
      }

      // Total stay per (phone, station).
      val stayTotals: RDD[((String, String), Long)] = signedSeconds.reduceByKey(_ + _)

      // Re-key by station id so the coordinates can be joined in.
      val byStation: RDD[(String, (String, Long))] = stayTotals.map {
        case ((phone, lacId), total) => (lacId, (phone, total))
      }

      // (stationId, (longitude, latitude)); any further fields are ignored.
      val stationCoords: RDD[(String, (String, String))] =
        sc.textFile(ExampleConstants.PATH_LAC_INFO).map { line =>
          val infos: Array[String] = line.split(",")
          (infos(0), (infos(1), infos(2)))
        }

      val joined: RDD[(String, ((String, Long), (String, String)))] =
        byStation.join(stationCoords)

      // Re-key by phone, carrying the total stay and the station details.
      val perPhone: RDD[(String, (Long, (String, String, String)))] = joined.map {
        case (lacId, ((phone, total), (x, y))) => (phone, (total, (lacId, x, y)))
      }

      // Longest stay first, keep two stations, drop the duration from the output.
      val res: RDD[(String, List[(String, String, String)])] =
        perPhone.groupByKey().mapValues { stays =>
          stays.toList.sortBy(stay => -stay._1).take(2).map(_._2)
        }

      res.foreach(println)

      // Saving to an empty path always fails, so the result is only written
      // when the caller supplies an output location.
      args.headOption.filter(_.nonEmpty).foreach(path => res.saveAsTextFile(path))
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
三、ip地址统计
给出ip地址统计优化方法
日志信息
20090121000132095572000|125.213.100.123|show.51.com|/shoplist.php?phpfile=shoplist2.php&style=1&sex=137|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Mozilla/4.0(Compatible Mozilla/4.0(Compatible-EmbeddedWB 14.59 http://bsalsa.com/ EmbeddedWB- 14.59 from: http://bsalsa.com/ )|http://show.51.com/main.php|
20090121000132124542000|117.101.215.133|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/index.php?uidhash=d1c3b69e9b8355a5204474c749fb76ef|__tkist=0; myloc=50|5008; myage=2009; PROFILE=14469674:苦涩咖啡:m:photos2.love21cn.com/45/1b/388111afac8195cc5d91ea286cdd:1::http://images.love21cn.com/w4/global/i/hykj_m.jpg; last_login_time=1232454068; SESSION_HASH=8176b100a84c9a095315f916d7fcbcf10021e3af; RAW_HASH=008a1bc48ff9ebafa3d5b4815edd04e9e7978050; COMMON_HASH=45388111afac8195cc5d91ea286cdd1b; pop_1232093956=1232468896968; pop_time=1232466715734; pop_1232245908=1232469069390; pop_1219903726=1232477601937; LOVESESSID=98b54794575bf547ea4b55e07efa2e9e; main_search:14469674=|||00; registeruid=14469674; REG_URL_COOKIE=http://photo.jiayuan.com/showphoto.php?uid_hash=0319bc5e33ba35755c30a9d88aaf46dc&total=6&p=5; click_count=0,3363619
20090121000132406516000|117.101.222.68|gg.xiaonei.com|/view.jsp?p=389|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; CIBA)|http://home.xiaonei.com/Home.do?id=229670724|_r01_=1; __utma=204579609.31669176.1231940225.1232462740.1232467011.145; __utmz=204579609.1231940225.1.1.utmccn=(direct)
20090121000132581311000|115.120.36.118|tj.tt98.com|/tj.htm|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://www.tt98.com/|
20090121000132864647000|123.197.64.247|cul.sohu.com|/20071227/n254338813_22.shtml|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://cul.sohu.com/20071227/n254338813_22.shtml|ArticleTab=visit:1; IPLOC=unknown; SUV=0901080709152121; vjuids=832dd37a1.11ebbc5d590.0.b20f858f14e918; club_chat_ircnick=JaabvxC4aaacQ; spanel={"u":""}; vjlast=1232467312,1232467312,30
20090121000133296729000|222.55.57.176|down.chinaz.com|/|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; iCafeMedia; TencentTraveler 4.0)||cnzz_a33219=0; vw33219=:18167791:; sin33219=http://www.itxls.com/wz/wyfx/it.html; rtime=0; ltime=1232464387281; cnzz_eid=6264952-1232464379-http://www.itxls.com/wz/wyfx/it.html
20090121000133331104000|123.197.66.93|www.pkwutai.cn|/down/downLoad-id-45383.html|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7)|http://www.baidu.com/s?tn=b1ank_pg&ie=gb2312&bs=